diff mbox

[API-NEXT,PATCHv7,2/5] linux-generic: packet: implement reference apis

Message ID 20170111023359.30597-2-bill.fischofer@linaro.org
State New
Headers show

Commit Message

Bill Fischofer Jan. 11, 2017, 2:33 a.m. UTC
Implement the APIs:
- odp_packet_ref_static()
- odp_packet_ref()
- odp_packet_ref_pkt()
- odp_packet_has_ref()
- odp_packet_unshared_len()

This also involves functional upgrades to the existing packet manipulation
APIs to work with packet references as input arguments.

Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

---
 .../linux-generic/include/odp_packet_internal.h    |  87 +++-
 platform/linux-generic/odp_packet.c                | 536 +++++++++++++++++----
 2 files changed, 516 insertions(+), 107 deletions(-)

-- 
2.9.3

Comments

Peltonen, Janne (Nokia - FI/Espoo) Feb. 17, 2017, 5:31 p.m. UTC | #1
Hi,

I took a look at the packet references and it seems to me that
either the implementation is a bit racy or I confused myself
when reading the code. Or maybe I got the intended concurrency
semantics of the packet references wrong?

My first issue is that packet_free() may access a freed packet
header or corrupt unshared_len.

The packet free function looks like this:

static inline void packet_free(odp_packet_hdr_t *pkt_hdr)
{
        odp_packet_hdr_t *ref_hdr;
        uint32_t ref_count;

        do {
                ref_hdr = pkt_hdr->ref_hdr;
                ref_count = packet_ref_count(pkt_hdr) - 1;
                free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

                if (ref_count == 1)
                        pkt_hdr->unshared_len = pkt_hdr->frame_len;

                pkt_hdr = ref_hdr;
        } while (pkt_hdr);
}

The problem here is that decrementing the ref_count, checking
its value and updating unshared_len are not a single atomic
operation. By the time packet_free() checks if ref_count == 1
(i.e. if there is exactly one other reference left somewhere),
the true ref_count may have already been changed by another
thread doing packet_free() or packet_ref().

For example, if two threads have a reference to the same packet
then execution (or the relevant memory ops) may get "interleaved"
as follows:

T1: call packet_free()
T1: ref_count = packet_ref_count(pkt_hdr) - 1;
At this point ref_count variable is 1
T1: call free_bufs()
T1: call packet_ref_dec()
Now the ref_count of the packet header is 1.
T2: call and complete packet_free()
Thread 2 sees refcount 1 in the packet and frees the buffers
T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;
Thread 1 accesses freed buffer for reading and writing.

Similarly, if T2 created a new reference, T1 would have
a wrong idea of the number of remaining references and
would adjust the unshared_len to an incorrect value.

Right?

Maybe other modifications of unshared_len are also racy.



The second issue is that the atomic ops for setting and
reading the ref count seem to use too relaxed a memory
ordering. In particular, packet_ref_dec() must not happen
(be visible to other threads) before its caller is done
with the packet and the related memory accesses have
completed. Currently there does not seem to be any optimization
barrier or memory barrier to prevent the ref count decrement
from happening too early. So I think it is at least theoretically
possible that a thread e.g. reads from a packet buffer
after it has already been freed by another thread, something
like this:

Source code order:
T1: interesting_data = read_from_pkt(pkt)
T1: packet_free(pkt)

Order visible to T2:
1: ref count decr
2: read from pkt

Now if T2 goes and frees the remaining reference between
steps 1 and 2, T1 may get even more interesting data.

Right?

	Janne


> -----Original Message-----

> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of Bill Fischofer

> Sent: Wednesday, January 11, 2017 4:34 AM

> To: lng-odp@lists.linaro.org

> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet: implement reference apis

> 

> Implement the APIs:

> - odp_packet_ref_static()

> - odp_packet_ref()

> - odp_packet_ref_pkt()

> - odp_packet_has_ref()

> - odp_packet_unshared_len()

> 

> This also involves functional upgrades to the existing packet manipulation

> APIs to work with packet references as input arguments.

> 

> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

> ---

>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

>  platform/linux-generic/odp_packet.c                | 536 +++++++++++++++++----

>  2 files changed, 516 insertions(+), 107 deletions(-)

> 

> diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-

> generic/include/odp_packet_internal.h

> index e6e9d74..607560d 100644

> --- a/platform/linux-generic/include/odp_packet_internal.h

> +++ b/platform/linux-generic/include/odp_packet_internal.h

> @@ -19,6 +19,7 @@ extern "C" {

> 

>  #include <odp/api/align.h>

>  #include <odp/api/debug.h>

> +#include <odp_debug_internal.h>

>  #include <odp_buffer_internal.h>

>  #include <odp_pool_internal.h>

>  #include <odp_buffer_inlines.h>

> @@ -168,7 +169,7 @@ typedef struct {

>   * packet_init(). Because of this any new fields added must be reviewed for

>   * initialization requirements.

>   */

> -typedef struct {

> +typedef struct odp_packet_hdr_t {

>  	/* common buffer header */

>  	odp_buffer_hdr_t buf_hdr;

> 

> @@ -184,6 +185,13 @@ typedef struct {

>  	uint32_t headroom;

>  	uint32_t tailroom;

> 

> +	/* Fields used to support packet references */

> +	uint32_t unshared_len;

> +	struct odp_packet_hdr_t *ref_hdr;

> +	uint32_t ref_offset;

> +	uint32_t ref_len;

> +	odp_atomic_u32_t ref_count;

> +

>  	/*

>  	 * Members below are not initialized by packet_init()

>  	 */

> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t *odp_packet_hdr(odp_packet_t pkt)

>  	return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

>  }

> 

> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,

> +						    uint32_t *offset)

> +{

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	odp_packet_hdr_t *prev_hdr = pkt_hdr;

> +	uint32_t ref_offset = 0;

> +

> +	while (pkt_hdr->ref_hdr) {

> +		ref_offset = pkt_hdr->ref_offset;

> +		prev_hdr   = pkt_hdr;

> +		pkt_hdr    = pkt_hdr->ref_hdr;

> +	}

> +

> +	if (offset) {

> +		if (prev_hdr != pkt_hdr)

> +			ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

> +		*offset = ref_offset;

> +	}

> +

> +	return pkt_hdr;

> +}

> +

> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t *pkt_hdr,

> +						    odp_packet_hdr_t *cur_hdr,

> +						    uint32_t *offset)

> +{

> +	uint32_t ref_offset = 0;

> +	odp_packet_hdr_t *prev_hdr = pkt_hdr;

> +

> +	while (pkt_hdr->ref_hdr != cur_hdr) {

> +		ref_offset = pkt_hdr->ref_offset;

> +		prev_hdr   = pkt_hdr;

> +		pkt_hdr    = pkt_hdr->ref_hdr;

> +	}

> +

> +	if (offset) {

> +		if (prev_hdr != pkt_hdr)

> +			ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

> +		*offset = ref_offset;

> +	}

> +

> +	return pkt_hdr;

> +}

> +

> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)

> +{

> +	return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

> +}

> +

>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t *src_hdr,

>  					       odp_packet_hdr_t *dst_hdr)

>  {

> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t *pkt_hdr, uint32_t

> len)

> 

>  	pkt_hdr->tailroom  += len;

>  	pkt_hdr->frame_len -= len;

> +	pkt_hdr->unshared_len -= len;

>  	pkt_hdr->buf_hdr.seg[last].len -= len;

>  }

> 

>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

>  {

> -	return pkt_hdr->frame_len;

> +	uint32_t pkt_len = 0;

> +	uint32_t offset  = 0;

> +

> +	do {

> +		pkt_len += pkt_hdr->frame_len - offset;

> +		offset   = pkt_hdr->ref_offset;

> +		if (pkt_hdr->ref_hdr)

> +			offset += (pkt_hdr->ref_hdr->frame_len -

> +				   pkt_hdr->ref_len);

> +		pkt_hdr  = pkt_hdr->ref_hdr;

> +	} while (pkt_hdr);

> +

> +	return pkt_len;

> +}

> +

> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

> +{

> +	return odp_atomic_load_u32(&pkt_hdr->ref_count);

> +}

> +

> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr, uint32_t n)

> +{

> +	odp_atomic_init_u32(&pkt_hdr->ref_count, n);

>  }

> 

>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>  {

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> +

>  	pkt_hdr->frame_len = len;

> +	pkt_hdr->unshared_len = len;

>  }

> 

>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

> diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c

> index f632a51..170965a 100644

> --- a/platform/linux-generic/odp_packet.c

> +++ b/platform/linux-generic/odp_packet.c

> @@ -33,13 +33,24 @@ static inline odp_buffer_t buffer_handle(odp_packet_hdr_t *pkt_hdr)

>  	return pkt_hdr->buf_hdr.handle.handle;

>  }

> 

> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

> +{

> +	return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

> +}

> +

> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

> +{

> +	return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

> +}

> +

>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

>  				      uint32_t seg_idx)

>  {

>  	return pkt_hdr->buf_hdr.seg[seg_idx].len;

>  }

> 

> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr, uint32_t seg_idx)

> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

> +				       uint32_t seg_idx)

>  {

>  	return pkt_hdr->buf_hdr.seg[seg_idx].data;

>  }

> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t *pkt_hdr)

>  		return pkt_hdr->buf_hdr.segcount - 1;

>  }

> 

> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> +{

> +	return pkt_hdr->buf_hdr.seg[0].data;

> +}

> +

>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t *pkt_hdr)

>  {

>  	return packet_seg_len(pkt_hdr, 0);

> @@ -64,11 +80,6 @@ static inline uint32_t packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)

>  	return packet_seg_len(pkt_hdr, last);

>  }

> 

> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> -{

> -	return pkt_hdr->buf_hdr.seg[0].data;

> -}

> -

>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

>  {

>  	int last = packet_last_seg(pkt_hdr);

> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>  {

>  	pkt_hdr->headroom  -= len;

>  	pkt_hdr->frame_len += len;

> +	pkt_hdr->unshared_len += len;

>  	pkt_hdr->buf_hdr.seg[0].data -= len;

>  	pkt_hdr->buf_hdr.seg[0].len  += len;

>  }

> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>  {

>  	pkt_hdr->headroom  += len;

>  	pkt_hdr->frame_len -= len;

> +	pkt_hdr->unshared_len -= len;

>  	pkt_hdr->buf_hdr.seg[0].data += len;

>  	pkt_hdr->buf_hdr.seg[0].len  -= len;

>  }

> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)

> 

>  	pkt_hdr->tailroom  -= len;

>  	pkt_hdr->frame_len += len;

> +	pkt_hdr->unshared_len += len;

>  	pkt_hdr->buf_hdr.seg[last].len += len;

>  }

> 

> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t *dst,

>  	dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

>  	dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

> 

> +	/* reference related metadata */

> +	dst->ref_len      = src->ref_len;

> +	dst->unshared_len = src->unshared_len;

> +

>  	/* segmentation data is not copied:

>  	 *   buf_hdr.seg[]

>  	 *   buf_hdr.segcount

> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t *pkt_hdr,

>  	int seg = 0;

>  	int seg_count = pkt_hdr->buf_hdr.segcount;

> 

> -	if (odp_unlikely(offset >= pkt_hdr->frame_len))

> +	/* Special processing for references */

> +	while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> +		offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> +		offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

> +		pkt_hdr   = pkt_hdr->ref_hdr;

> +		seg_count = pkt_hdr->buf_hdr.segcount;

> +	}

> +

> +	if (odp_unlikely(offset > pkt_hdr->frame_len))

>  		return NULL;

> 

>  	if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {

> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)

>  	pkt_hdr->p.l2_offset        = 0;

>  	pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

>  	pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

> +

> +	/* Ensure dummy pkt_hdrs used in I/O recv classification are valid */

> +	pkt_hdr->ref_hdr = NULL;

>  }

> 

>  /**

> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t *pkt_hdr, uint32_t

> len,

>  			     CONFIG_PACKET_TAILROOM;

> 

>  	pkt_hdr->input = ODP_PKTIO_INVALID;

> +

> +	/* By default packet has no references */

> +	pkt_hdr->unshared_len = len;

> +	pkt_hdr->ref_hdr = NULL;

>  }

> 

>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> 

>  	hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

>  	hdr->buf_hdr.seg[0].len  = BASE_LEN;

> +	packet_ref_count_set(hdr, 1);

> 

>  	/* Link segments */

>  	if (CONFIG_PACKET_MAX_SEGS != 1) {

> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>  			for (i = 1; i < num; i++) {

>  				odp_buffer_hdr_t *buf_hdr;

> 

> +				packet_ref_count_set(pkt_hdr[i], 1);

>  				buf_hdr = &pkt_hdr[i]->buf_hdr;

>  				hdr->buf_hdr.seg[i].hdr  = buf_hdr;

>  				hdr->buf_hdr.seg[i].data = buf_hdr->base_data;

> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

> *pkt_hdr,

>  		new_hdr->buf_hdr.seg[0].len   = seg_len;

> 

>  		packet_seg_copy_md(new_hdr, pkt_hdr);

> -		new_hdr->frame_len = pkt_hdr->frame_len + len;

> -		new_hdr->headroom  = pool->headroom + offset;

> -		new_hdr->tailroom  = pkt_hdr->tailroom;

> +		new_hdr->frame_len    = pkt_hdr->frame_len + len;

> +		new_hdr->unshared_len = pkt_hdr->unshared_len + len;

> +		new_hdr->headroom     = pool->headroom + offset;

> +		new_hdr->tailroom     = pkt_hdr->tailroom;

> 

>  		pkt_hdr = new_hdr;

>  	} else {

> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

> *pkt_hdr,

>  		last = packet_last_seg(pkt_hdr);

>  		pkt_hdr->buf_hdr.seg[last].len = seg_len;

> 

> -		pkt_hdr->frame_len += len;

> -		pkt_hdr->tailroom   = pool->tailroom + offset;

> +		pkt_hdr->frame_len    += len;

> +		pkt_hdr->unshared_len += len;

> +		pkt_hdr->tailroom      = pool->tailroom + offset;

>  	}

> 

>  	return pkt_hdr;

> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

> *pkt_hdr,

> 

>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first, int num)

>  {

> -	int i;

> +	int i, nfree;

>  	odp_buffer_t buf[num];

> 

> -	for (i = 0; i < num; i++)

> -		buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first + i].hdr);

> +	for (i = 0, nfree = 0; i < num; i++) {

> +		odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first + i].hdr;

> +

> +		if (packet_ref_dec(hdr) == 1)

> +			buf[nfree++] = buffer_handle(hdr);

> +	}

> 

> -	buffer_free_multi(buf, num);

> +	if (nfree > 0)

> +		buffer_free_multi(buf, nfree);

>  }

> 

>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

> *pkt_hdr,

> 

>  	if (head) {

>  		odp_packet_hdr_t *new_hdr;

> -		int i;

> +		int i, nfree;

>  		odp_buffer_t buf[num];

> 

> -		for (i = 0; i < num; i++)

> -			buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);

> +		for (i = 0, nfree = 0; i < num; i++) {

> +			new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> +

> +			if (packet_ref_dec(new_hdr) == 1)

> +				buf[nfree++] = buffer_handle(new_hdr);

> +		}

> 

>  		/* First remaining segment is the new packet descriptor */

>  		new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

> *pkt_hdr,

>  		packet_seg_copy_md(new_hdr, pkt_hdr);

> 

>  		/* Tailroom not changed */

> -		new_hdr->tailroom  = pkt_hdr->tailroom;

> -		new_hdr->headroom  = seg_headroom(new_hdr, 0);

> -		new_hdr->frame_len = pkt_hdr->frame_len - free_len;

> +		new_hdr->tailroom     = pkt_hdr->tailroom;

> +		new_hdr->headroom     = seg_headroom(new_hdr, 0);

> +		new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

> +		new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;

> 

>  		pull_head(new_hdr, pull_len);

> 

>  		pkt_hdr = new_hdr;

> 

> -		buffer_free_multi(buf, num);

> +		if (nfree > 0)

> +			buffer_free_multi(buf, nfree);

>  	} else {

>  		/* Free last 'num' bufs */

>  		free_bufs(pkt_hdr, num_remain, num);

> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

> *pkt_hdr,

>  		 * of the metadata. */

>  		pkt_hdr->buf_hdr.segcount = num_remain;

>  		pkt_hdr->frame_len -= free_len;

> +		pkt_hdr->unshared_len -= free_len;

>  		pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain - 1);

> 

>  		pull_tail(pkt_hdr, pull_len);

> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t pool_hdl, uint32_t len,

>  	return num;

>  }

> 

> -void odp_packet_free(odp_packet_t pkt)

> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> -	int num_seg = pkt_hdr->buf_hdr.segcount;

> +	odp_packet_hdr_t *ref_hdr;

> +	uint32_t ref_count;

> 

> -	if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

> -		buffer_free_multi((odp_buffer_t *)&pkt, 1);

> -	else

> -		free_bufs(pkt_hdr, 0, num_seg);

> -}

> +	do {

> +		ref_hdr = pkt_hdr->ref_hdr;

> +		ref_count = packet_ref_count(pkt_hdr) - 1;

> +		free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> 

> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> -{

> -	if (CONFIG_PACKET_MAX_SEGS == 1) {

> -		buffer_free_multi((const odp_buffer_t * const)pkt, num);

> -	} else {

> -		odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

> -		int i, j;

> -		int bufs = 0;

> +		if (ref_count == 1)

> +			pkt_hdr->unshared_len = pkt_hdr->frame_len;

> 

> -		for (i = 0; i < num; i++) {

> -			odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt[i]);

> -			int num_seg = pkt_hdr->buf_hdr.segcount;

> -			odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

> -

> -			buf[bufs] = (odp_buffer_t)pkt[i];

> -			bufs++;

> +		pkt_hdr = ref_hdr;

> +	} while (pkt_hdr);

> +}

> 

> -			if (odp_likely(num_seg == 1))

> -				continue;

> +void odp_packet_free(odp_packet_t pkt)

> +{

> +	packet_free(odp_packet_hdr(pkt));

> +}

> 

> -			for (j = 1; j < num_seg; j++) {

> -				buf[bufs] = buffer_handle(buf_hdr->seg[j].hdr);

> -				bufs++;

> -			}

> -		}

> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> +{

> +	int i;

> 

> -		buffer_free_multi(buf, bufs);

> -	}

> +	for (i = 0; i < num; i++)

> +		packet_free(odp_packet_hdr(pkt[i]));

>  }

> 

>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>  	if (len > pool->headroom + pool->data_size + pool->tailroom)

>  		return -1;

> 

> +	if (pkt_hdr->ref_hdr)

> +		packet_free(pkt_hdr->ref_hdr);

> +

>  	packet_init(pkt_hdr, len, 0);

> 

>  	return 0;

> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	uint32_t buf_len = 0;

> 

> -	return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

> +	do {

> +		buf_len += pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

> +		pkt_hdr  = pkt_hdr->ref_hdr;

> +	} while (pkt_hdr);

> +

> +	return buf_len;

>  }

> 

>  void *odp_packet_data(odp_packet_t pkt)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> 

> -	return packet_data(pkt_hdr);

> +	return packet_map(pkt_hdr, 0, NULL, NULL);

>  }

> 

>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

> 

>  uint32_t odp_packet_len(odp_packet_t pkt)

>  {

> -	return odp_packet_hdr(pkt)->frame_len;

> +	return packet_len(odp_packet_hdr(pkt));

> +}

> +

> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

> +{

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	uint32_t pkt_len = 0, offset = 0;

> +

> +	do {

> +		if (packet_ref_count(pkt_hdr) > 1) {

> +			if (offset == 0)

> +				pkt_len += pkt_hdr->unshared_len;

> +			break;

> +		}

> +

> +		pkt_len += pkt_hdr->frame_len - offset;

> +		offset   = pkt_hdr->ref_offset;

> +

> +		if (pkt_hdr->ref_hdr)

> +			offset += (pkt_hdr->ref_hdr->frame_len -

> +				   pkt_hdr->ref_len);

> +

> +		pkt_hdr = pkt_hdr->ref_hdr;

> +	} while (pkt_hdr);

> +

> +	return pkt_len;

>  }

> 

>  uint32_t odp_packet_headroom(odp_packet_t pkt)

> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

> 

>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

>  {

> -	return odp_packet_hdr(pkt)->tailroom;

> +	return odp_packet_last_hdr(pkt, NULL)->tailroom;

>  }

> 

>  void *odp_packet_tail(odp_packet_t pkt)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

> 

>  	return packet_tail(pkt_hdr);

>  }

> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>  	uint32_t frame_len = pkt_hdr->frame_len;

> -	uint32_t headroom  = pkt_hdr->headroom;

> +	uint32_t headroom = pkt_hdr->headroom;

>  	int ret = 0;

> 

>  	if (len > headroom) {

> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>  		segs = pkt_hdr->buf_hdr.segcount;

> 

>  		if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS)) {

> +			/* Handle recursively via references when

> +			 * working with referenced packets since another

> +			 * thread may be accessing it concurrently via

> +			 * its reference to it. */

> +			if (packet_ref_count(pkt_hdr) > 1) {

> +				odp_packet_t ref;

> +				uint32_t unshared_len;

> +

> +				push_head(pkt_hdr, headroom);

> +				unshared_len = pkt_hdr->unshared_len;

> +				ref = odp_packet_ref(*pkt, 0);

> +

> +				if (ref == ODP_PACKET_INVALID) {

> +					pull_head(pkt_hdr, headroom);

> +					return -1;

> +				}

> +

> +				ret = odp_packet_extend_head(&ref,

> +							     len - headroom,

> +							     data_ptr,

> +							     seg_len);

> +

> +				if (ret < 0) {

> +					odp_packet_free(ref);

> +					pull_head(pkt_hdr, headroom);

> +					return -1;

> +				}

> +

> +				/* Since this is a special ref, the

> +				 * base pkt's unshared len is unchanged */

> +				pkt_hdr->unshared_len = unshared_len;

> +

> +				/* Remove extra ref to the base pkt */

> +				odp_packet_free(*pkt);

> +

> +				/* Return the ref as the extension result */

> +				*pkt = ref;

> +				return 1;

> +			}

> +

>  			/* Cannot directly add new segments */

>  			odp_packet_hdr_t *new_hdr;

>  			int new_segs = 0;

> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

> 

>  			pkt_hdr->buf_hdr.segcount = segs;

>  			pkt_hdr->frame_len        = frame_len;

> +			pkt_hdr->unshared_len     = frame_len;

>  			pkt_hdr->headroom         = offset + pool->headroom;

>  			pkt_hdr->tailroom         = pool->tailroom;

> 

> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>  		push_head(pkt_hdr, len);

>  	}

> 

> -	if (data_ptr)

> -		*data_ptr = packet_data(pkt_hdr);

> +	if (data_ptr || seg_len) {

> +		uint32_t seg_ln = 0;

> +		void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

> 

> -	if (seg_len)

> -		*seg_len = packet_first_seg_len(pkt_hdr);

> +		if (data_ptr)

> +			*data_ptr = data;

> +

> +		if (seg_len)

> +			*seg_len = seg_ln;

> +	}

> 

>  	return ret;

>  }

> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>  	if (len > pkt_hdr->frame_len)

>  		return NULL;

> 

> +	ODP_ASSERT(len <= pkt_hdr->unshared_len);

> +

>  	pull_head(pkt_hdr, len);

>  	return packet_data(pkt_hdr);

>  }

> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>  			  void **data_ptr, uint32_t *seg_len_out)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

>  	uint32_t seg_len = packet_first_seg_len(pkt_hdr);

> +	int ret = 0;

> 

> -	if (len > pkt_hdr->frame_len)

> +	if (len > packet_len(pkt_hdr))

>  		return -1;

> 

> -	if (len < seg_len) {

> +	ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

> +

> +	/* Special processing for references */

> +	while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> +		ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> +		nxt_hdr = pkt_hdr->ref_hdr;

> +		len -= pkt_hdr->frame_len;

> +		len += pkt_hdr->ref_offset +

> +			(nxt_hdr->frame_len - pkt_hdr->ref_len);

> +		pkt_hdr->ref_hdr = NULL;

> +		packet_free(pkt_hdr);

> +		pkt_hdr = nxt_hdr;

> +		seg_len = packet_first_seg_len(pkt_hdr);

> +		*pkt = packet_handle(pkt_hdr);

> +		ret = 1;

> +	}

> +

> +	if (CONFIG_PACKET_MAX_SEGS == 1 ||

> +	    len < seg_len ||

> +	    pkt_hdr->buf_hdr.segcount == 1) {

>  		pull_head(pkt_hdr, len);

> -	} else if (CONFIG_PACKET_MAX_SEGS != 1) {

> +	} else {

>  		int num = 0;

>  		uint32_t pull_len = 0;

> 

> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>  		*pkt    = packet_handle(pkt_hdr);

>  	}

> 

> -	if (data_ptr)

> -		*data_ptr = packet_data(pkt_hdr);

> +	if (data_ptr || seg_len_out) {

> +		void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);

> 

> -	if (seg_len_out)

> -		*seg_len_out = packet_first_seg_len(pkt_hdr);

> +		if (data_ptr)

> +			*data_ptr = data_head;

> 

> -	return 0;

> +		if (seg_len_out)

> +			*seg_len_out = seg_len;

> +	}

> +

> +	return ret;

>  }

> 

>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>  	void *old_tail;

> 

>  	if (len > pkt_hdr->tailroom)

>  		return NULL;

> 

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> +

>  	old_tail = packet_tail(pkt_hdr);

>  	push_tail(pkt_hdr, len);

> 

> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

>  			   void **data_ptr, uint32_t *seg_len_out)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

>  	uint32_t frame_len = pkt_hdr->frame_len;

>  	uint32_t tailroom  = pkt_hdr->tailroom;

>  	uint32_t tail_off  = frame_len;

>  	int ret = 0;

> 

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> +

>  	if (len > tailroom) {

>  		pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);

>  		int num;

> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt, uint32_t len)

>  	if (len > packet_last_seg_len(pkt_hdr))

>  		return NULL;

> 

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>  	pull_tail(pkt_hdr, len);

> 

>  	return packet_tail(pkt_hdr);

> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt, uint32_t len,

>  {

>  	int last;

>  	uint32_t seg_len;

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> +	uint32_t offset;

> +	odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

> +	odp_packet_hdr_t *pkt_hdr, *prev_hdr;

> 

> -	if (len > pkt_hdr->frame_len)

> +	if (len > packet_len(first_hdr))

>  		return -1;

> 

> +	pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

> +

> +	/* Special processing for references */

> +	while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr) {

> +		len -= (pkt_hdr->frame_len - offset);

> +		prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr, &offset);

> +		ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

> +		prev_hdr->ref_hdr = NULL;

> +		packet_free(pkt_hdr);

> +		pkt_hdr = prev_hdr;

> +	}

> +

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>  	last    = packet_last_seg(pkt_hdr);

>  	seg_len = packet_seg_len(pkt_hdr, last);

> 

> -	if (len < seg_len) {

> +	if (CONFIG_PACKET_MAX_SEGS == 1 ||

> +	    len < seg_len ||

> +	    pkt_hdr->buf_hdr.segcount == 1) {

>  		pull_tail(pkt_hdr, len);

> -	} else if (CONFIG_PACKET_MAX_SEGS != 1) {

> +	} else {

>  		int num = 0;

>  		uint32_t pull_len = 0;

> 

> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt, odp_time_t timestamp)

> 

>  int odp_packet_is_segmented(odp_packet_t pkt)

>  {

> -	return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +

> +	return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;

>  }

> 

>  int odp_packet_num_segs(odp_packet_t pkt)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	uint32_t segcount = 0, i;

> +	uint32_t seg_offset = 0, offset;

> +

> +	do {

> +		segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

> +		offset    = pkt_hdr->ref_offset;

> +		pkt_hdr   = pkt_hdr->ref_hdr;

> +		if (pkt_hdr) {

> +			for (i = 0, seg_offset = 0;

> +			     i < pkt_hdr->buf_hdr.segcount;

> +			     i++, seg_offset++) {

> +				if (offset < pkt_hdr->buf_hdr.seg[i].len)

> +					break;

> +				offset -= pkt_hdr->buf_hdr.seg[i].len;

> +			}

> +		}

> +	} while (pkt_hdr);

> 

> -	return pkt_hdr->buf_hdr.segcount;

> +	return segcount;

>  }

> 

> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

>  {

> -	(void)pkt;

> -

>  	return 0;

>  }

> 

>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

>  {

> -	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> -

> -	return packet_last_seg(pkt_hdr);

> +	return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

>  }

> 

>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> 

> -	if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))

> +	if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

>  		return ODP_PACKET_SEG_INVALID;

> 

>  	return seg + 1;

> @@ -1400,21 +1581,51 @@ odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt,

> odp_packet_seg_t seg)

>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	uint32_t seg_offset = 0, offset = 0, i;

> +

> +	while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> +	       pkt_hdr->ref_hdr) {

> +		seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> +		offset  = pkt_hdr->ref_offset;

> +		pkt_hdr = pkt_hdr->ref_hdr;

> +		for (i = 0, seg_offset = 0;

> +		     i < pkt_hdr->buf_hdr.segcount;

> +		     i++, seg_offset++) {

> +			if (offset < pkt_hdr->buf_hdr.seg[i].len)

> +				break;

> +			offset -= pkt_hdr->buf_hdr.seg[i].len;

> +		}

> +	}

> 

> -	if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> +	if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>  		return NULL;

> 

> -	return packet_seg_data(pkt_hdr, seg);

> +	return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

>  }

> 

>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t seg)

>  {

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +	uint32_t seg_offset = 0, offset = 0, i;

> +

> +	while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> +	       pkt_hdr->ref_hdr) {

> +		seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> +		offset  = pkt_hdr->ref_offset;

> +		pkt_hdr = pkt_hdr->ref_hdr;

> +		for (i = 0, seg_offset = 0;

> +		     i < pkt_hdr->buf_hdr.segcount;

> +		     i++, seg_offset++) {

> +			if (offset < pkt_hdr->buf_hdr.seg[i].len)

> +				break;

> +			offset -= pkt_hdr->buf_hdr.seg[i].len;

> +		}

> +	}

> 

> -	if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> +	if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>  		return 0;

> 

> -	return packet_seg_len(pkt_hdr, seg);

> +	return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

>  }

> 

>  /*

> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t *pkt_ptr, uint32_t offset,

> uint32_t len)

>  {

>  	odp_packet_t pkt = *pkt_ptr;

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> -	uint32_t pktlen = pkt_hdr->frame_len;

> +	uint32_t pktlen = packet_len(pkt_hdr);

>  	odp_packet_t newpkt;

> 

>  	if (offset > pktlen)

>  		return -1;

> 

> +	ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

> +

>  	newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen + len);

> 

>  	if (newpkt == ODP_PACKET_INVALID)

> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt, uint32_t offset, uint32_t

> len,

>  	if (align > ODP_CACHE_LINE_SIZE)

>  		return -1;

> 

> +	ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

> +

>  	if (seglen >= len) {

>  		misalign = align <= 1 ? 0 :

>  			ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

>  	uint32_t dst_len    = dst_hdr->frame_len;

>  	uint32_t src_len    = src_hdr->frame_len;

> 

> +	ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

> +

>  	/* Do a copy if resulting packet would be out of segments or packets

> -	 * are from different pools. */

> +	 * are from different pools or src is a reference. */

>  	if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS) ||

> -	    odp_unlikely(dst_pool != src_pool)) {

> +	    odp_unlikely(dst_pool != src_pool) ||

> +	    odp_unlikely(packet_ref_count(src_hdr)) > 1) {

>  		if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >= 0) {

>  			(void)odp_packet_copy_from_pkt(*dst, dst_len,

>  						       src, 0, src_len);

> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

> 

>  	add_all_segs(dst_hdr, src_hdr);

> 

> -	dst_hdr->frame_len = dst_len + src_len;

> -	dst_hdr->tailroom  = src_hdr->tailroom;

> +	dst_hdr->frame_len    = dst_len + src_len;

> +	dst_hdr->unshared_len = dst_len + src_len;

> +	dst_hdr->tailroom     = src_hdr->tailroom;

> 

>  	/* Data was not moved in memory */

>  	return 0;

> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

> *tail)

>  	if (len >= pktlen || tail == NULL)

>  		return -1;

> 

> +	ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

>  	*tail = odp_packet_copy_part(*pkt, len, pktlen - len,

>  				     odp_packet_pool(*pkt));

> 

> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

> *tail)

>  }

> 

>  /*

> + * References

> + */

> +

> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

> +{

> +	uint32_t i;

> +	odp_packet_hdr_t *hdr;

> +

> +	do {

> +		for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

> +			hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> +			packet_ref_inc(hdr);

> +		}

> +

> +		pkt_hdr = pkt_hdr->ref_hdr;

> +	} while (pkt_hdr);

> +}

> +

> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

> +					 uint32_t offset,

> +					 odp_packet_hdr_t *ref_hdr)

> +{

> +	/* Catch attempted references to stale handles in debug builds */

> +	ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

> +

> +	/* Splicing is from the last section of src pkt */

> +	while (ref_hdr->ref_hdr)

> +		ref_hdr = ref_hdr->ref_hdr;

> +

> +	/* Find section where splice begins */

> +	while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> +		offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> +		offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

> +		pkt_hdr   = pkt_hdr->ref_hdr;

> +	}

> +

> +	ref_hdr->ref_hdr    = pkt_hdr;

> +	ref_hdr->ref_offset = offset;

> +	ref_hdr->ref_len    = pkt_hdr->frame_len;

> +

> +	if (offset < pkt_hdr->unshared_len)

> +		pkt_hdr->unshared_len = offset;

> +

> +	packet_ref(pkt_hdr);

> +	return _odp_packet_hdl(ref_hdr);

> +}

> +

> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

> +{

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +

> +	pkt_hdr->unshared_len = 0;

> +	packet_ref(pkt_hdr);

> +	return pkt;

> +}

> +

> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

> +{

> +	odp_packet_t hdr;

> +	odp_packet_hdr_t *pkt_hdr;

> +

> +	if (pkt == ODP_PACKET_INVALID)

> +		return ODP_PACKET_INVALID;

> +

> +	pkt_hdr = odp_packet_hdr(pkt);

> +	if (offset >= packet_len(pkt_hdr))

> +		return ODP_PACKET_INVALID;

> +

> +	hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

> +

> +	if (hdr == ODP_PACKET_INVALID)

> +		return ODP_PACKET_INVALID;

> +

> +	return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> +}

> +

> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

> +				odp_packet_t hdr)

> +{

> +	odp_packet_hdr_t *pkt_hdr;

> +

> +	if (pkt == ODP_PACKET_INVALID ||

> +	    hdr == ODP_PACKET_INVALID ||

> +	    pkt == hdr)

> +		return ODP_PACKET_INVALID;

> +

> +	ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

> +

> +	pkt_hdr = odp_packet_hdr(pkt);

> +	if (offset >= packet_len(pkt_hdr))

> +		return ODP_PACKET_INVALID;

> +

> +	return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> +}

> +

> +int odp_packet_has_ref(odp_packet_t pkt)

> +{

> +	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> +

> +	return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;

> +}

> +

> +/*

>   *

>   * Copy

>   * ********************************************************

> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

> *tail)

> 

>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

>  {

> -	odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

> -	uint32_t pktlen = srchdr->frame_len;

> +	uint32_t pktlen = odp_packet_len(pkt);

>  	odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

> 

>  	if (newpkt != ODP_PACKET_INVALID) {

> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt, uint32_t offset,

>  	uint8_t *dstaddr = (uint8_t *)dst;

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> 

> -	if (offset + len > pkt_hdr->frame_len)

> +	if (offset + len > packet_len(pkt_hdr))

>  		return -1;

> 

>  	while (len > 0) {

> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t pkt, uint32_t offset,

>  	const uint8_t *srcaddr = (const uint8_t *)src;

>  	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> 

> -	if (offset + len > pkt_hdr->frame_len)

> +	if (offset + len > packet_len(pkt_hdr))

>  		return -1;

> 

> +	ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

> +

>  	while (len > 0) {

>  		mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

>  		cpylen = len > seglen ? seglen : len;

> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t dst, uint32_t

> dst_offset,

>  	uint32_t src_seglen = 0; /* GCC */

>  	int overlap;

> 

> -	if (dst_offset + len > dst_hdr->frame_len ||

> -	    src_offset + len > src_hdr->frame_len)

> +	if (dst_offset + len > packet_len(dst_hdr) ||

> +	    src_offset + len > packet_len(src_hdr))

>  		return -1;

> 

> +	ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

> +

>  	overlap = (dst_hdr == src_hdr &&

>  		   ((dst_offset <= src_offset &&

>  		     dst_offset + len >= src_offset) ||

> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

>  	len += snprintf(&str[len], n - len,

>  			"  l4_offset    %" PRIu32 "\n", hdr->p.l4_offset);

>  	len += snprintf(&str[len], n - len,

> -			"  frame_len    %" PRIu32 "\n", hdr->frame_len);

> +			"  frame_len    %" PRIu32 "\n", packet_len(hdr));

>  	len += snprintf(&str[len], n - len,

>  			"  input        %" PRIu64 "\n",

>  			odp_pktio_to_u64(hdr->input));

> --

> 2.9.3
Bill Fischofer Feb. 17, 2017, 8:39 p.m. UTC | #2
First off, thank you very much for this review.

Please note that this code has been streamlined in patch
http://patches.opendataplane.org/patch/7879/ and has been further
refined with patch http://patches.opendataplane.org/patch/8145/ but
the exposure you identify still exists in that code.

On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)
<janne.peltonen@nokia.com> wrote:
> Hi,

>

> I took a look at the packet references and it seems to me that

> either the implementation is a bit racy or I confused myself

> when reading the code. Or maybe I got the intended concurrency

> semantics of the packet references wrong?

>

> My first issue is that packet_free() may access freed packet

> header or corrupt unshared_len.

>

> The packet free function looks like this:

>

> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> {

>         odp_packet_hdr_t *ref_hdr;

>         uint32_t ref_count;

>

>         do {

>                 ref_hdr = pkt_hdr->ref_hdr;

>                 ref_count = packet_ref_count(pkt_hdr) - 1;

>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>

>                 if (ref_count == 1)

>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

>

>                 pkt_hdr = ref_hdr;

>         } while (pkt_hdr);

> }

>

> The problem here is that decrementing the ref_count, checking

> its value and updating unshared_len is not single atomic

> operation. By the time packet_free() checks if ref_count == 1

> (i.e. if there is exactly one other reference left somewhere),

> the true ref_count may have already been changed by another

> thread doing packet_free() or packet_ref().

>

> For example, if two threads have a reference to the same packet

> then execution (or the relevant memory ops) may get "interleaved"

> as follows:

>

> T1: call packet_free()

> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

> At this point ref_count variable is 1

> T1: call free_bufs()

> T1: call packet_ref_dec()

> Now the ref_count of the packet header is 1.

> T2: call and complete packet_free()

> Thread 2 sees refcount 1 in the packet and frees the buffers

> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

> Thread 1 accesses freed buffer for reading and writing.


I agree. These steps should be reversed so that the code reads:

if (ref_count == 1)
   pkt_hdr->unshared_len = pkt_hdr->frame_len;

free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

Or using the code with the above two patches applied, the code should read:

static inline void packet_free(odp_packet_hdr_t *pkt_hdr)
{
        odp_packet_hdr_t *ref_hdr;
        uint32_t ref_count;
        int num_seg;

        do {
                ref_count = packet_ref_count(pkt_hdr);
                num_seg = pkt_hdr->buf_hdr.segcount;
                ref_hdr = pkt_hdr->ref_hdr;

                if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&
                    ref_count == 1)) {
                        buffer_free_multi((odp_buffer_t *)
                                          &pkt_hdr->buf_hdr.handle.handle, 1);
                } else {
                        if (ref_count == 2)
                                pkt_hdr->unshared_len = pkt_hdr->frame_len;

                        free_bufs(pkt_hdr, 0, num_seg);
                 }

                 pkt_hdr = ref_hdr;
        } while (pkt_hdr);
}

The mistake was trying to optimize things so that unshared_len is not
set if the buffers are being freed, but that exposes these race
conditions. So the worst that should now happen is that it is set
unnecessarily before being freed.

If you concur I'll fold this fix into a v3 for patch
http://patches.opendataplane.org/patch/8145/

>

> Similarly, if T2 created a new reference, T1 would have

> a wrong idea of the number of remaining references and

> would adjust the unshared_len to an incorrect value.

>

> Right?

>

> Maybe other modifications of unshared_len are also racy.


I don't believe so, because references do not change the existing ODP
restriction that two threads cannot share the same odp_packet_t. When
a packet reference is created it returns a separate odp_packet_t that
has its own metadata. So unshared_len is always private to an
individual odp_packet_t. The exception is static references, but in
this case the entire packet along with its metadata must be treated as
read-only, so operations like odp_packet_push_head() that would try to
modify unshared_len are prohibited.

>

>

>

> The second issue is that the atomic ops for setting and

> reading the ref count seem to have too relaxed memory

> ordering. In particular, packet_ref_dec() must not happen

> (be visible to other threads) before its caller is done

> with the packet and the related memory accesses have

> completed. Now there does not seem to be any optimization

> and memory barrier to prevent the ref count decrementing

> happening too early. So I think it is at least theoretically

> possible that a thread e.g. reads from a packet buffer

> after it has already been freed by another thread, somehow

> like this:

>

> Source code order:

> T1: interesting_data = read_from_pkt(pkt)

> T1: packet_free(pkt)

>

> Order visible to T2:

> 1: ref count decr

> 2: read from pkt

>

> Now if T2 goes and frees the remaining reference between

> steps 1 and 2, T1 may get even more interesting data.

>

> Right?


I don't believe so. The semantics of odp_atomic_fetch_dec_u32(), which
is what packet_ref_dec() uses, says that no two calls can see the same
fetched value, so only one thread will return ref_count == 1 and free
the buffer. Note that if I see ref_count == 1 no other thread can be
trying to increment it via a concurrent odp_packet_ref() call because
that would mean that two threads were trying to manipulate the same
odp_packet_t, which is prohibited.

For CPUs that support out-of-order instruction execution, reordering is
only permitted provided that the reordering and speculative execution are
semantically consistent with sequential execution. If this were not
the case you'd constantly have to worry about a processor turning

T1: interesting_data = read_from_pkt(pkt)
T1: packet_free(pkt)

into

T1: packet_free(pkt)
T1: interesting_data = read_from_pkt(pkt)

In your scenario above: T2 cannot be issuing a read to pkt after
ref_count is decremented because the only way that T2 could be
decrementing ref_count would be if T2 issued an odp_packet_free() call
for it. Obviously if it tries to reference pkt after such a call that
is an application error.

Thanks again for your much-appreciated help in looking at this!

>

>         Janne

>

>

>> -----Original Message-----

>> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of Bill Fischofer

>> Sent: Wednesday, January 11, 2017 4:34 AM

>> To: lng-odp@lists.linaro.org

>> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet: implement reference apis

>>

>> Implement the APIs:

>> - odp_packet_ref_static()

>> - odp_packet_ref()

>> - odp_packet_ref_pkt()

>> - odp_packet_has_ref()

>> - odp_packet_unshared_len()

>>

>> This also involves functional upgrades to the existing packet manipulation

>> APIs to work with packet references as input arguments.

>>

>> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

>> ---

>>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

>>  platform/linux-generic/odp_packet.c                | 536 +++++++++++++++++----

>>  2 files changed, 516 insertions(+), 107 deletions(-)

>>

>> diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-

>> generic/include/odp_packet_internal.h

>> index e6e9d74..607560d 100644

>> --- a/platform/linux-generic/include/odp_packet_internal.h

>> +++ b/platform/linux-generic/include/odp_packet_internal.h

>> @@ -19,6 +19,7 @@ extern "C" {

>>

>>  #include <odp/api/align.h>

>>  #include <odp/api/debug.h>

>> +#include <odp_debug_internal.h>

>>  #include <odp_buffer_internal.h>

>>  #include <odp_pool_internal.h>

>>  #include <odp_buffer_inlines.h>

>> @@ -168,7 +169,7 @@ typedef struct {

>>   * packet_init(). Because of this any new fields added must be reviewed for

>>   * initialization requirements.

>>   */

>> -typedef struct {

>> +typedef struct odp_packet_hdr_t {

>>       /* common buffer header */

>>       odp_buffer_hdr_t buf_hdr;

>>

>> @@ -184,6 +185,13 @@ typedef struct {

>>       uint32_t headroom;

>>       uint32_t tailroom;

>>

>> +     /* Fields used to support packet references */

>> +     uint32_t unshared_len;

>> +     struct odp_packet_hdr_t *ref_hdr;

>> +     uint32_t ref_offset;

>> +     uint32_t ref_len;

>> +     odp_atomic_u32_t ref_count;

>> +

>>       /*

>>        * Members below are not initialized by packet_init()

>>        */

>> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t *odp_packet_hdr(odp_packet_t pkt)

>>       return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

>>  }

>>

>> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,

>> +                                                 uint32_t *offset)

>> +{

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>> +     uint32_t ref_offset = 0;

>> +

>> +     while (pkt_hdr->ref_hdr) {

>> +             ref_offset = pkt_hdr->ref_offset;

>> +             prev_hdr   = pkt_hdr;

>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>> +     }

>> +

>> +     if (offset) {

>> +             if (prev_hdr != pkt_hdr)

>> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

>> +             *offset = ref_offset;

>> +     }

>> +

>> +     return pkt_hdr;

>> +}

>> +

>> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t *pkt_hdr,

>> +                                                 odp_packet_hdr_t *cur_hdr,

>> +                                                 uint32_t *offset)

>> +{

>> +     uint32_t ref_offset = 0;

>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>> +

>> +     while (pkt_hdr->ref_hdr != cur_hdr) {

>> +             ref_offset = pkt_hdr->ref_offset;

>> +             prev_hdr   = pkt_hdr;

>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>> +     }

>> +

>> +     if (offset) {

>> +             if (prev_hdr != pkt_hdr)

>> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

>> +             *offset = ref_offset;

>> +     }

>> +

>> +     return pkt_hdr;

>> +}

>> +

>> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

>> +}

>> +

>>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t *src_hdr,

>>                                              odp_packet_hdr_t *dst_hdr)

>>  {

>> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t *pkt_hdr, uint32_t

>> len)

>>

>>       pkt_hdr->tailroom  += len;

>>       pkt_hdr->frame_len -= len;

>> +     pkt_hdr->unshared_len -= len;

>>       pkt_hdr->buf_hdr.seg[last].len -= len;

>>  }

>>

>>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

>>  {

>> -     return pkt_hdr->frame_len;

>> +     uint32_t pkt_len = 0;

>> +     uint32_t offset  = 0;

>> +

>> +     do {

>> +             pkt_len += pkt_hdr->frame_len - offset;

>> +             offset   = pkt_hdr->ref_offset;

>> +             if (pkt_hdr->ref_hdr)

>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>> +                                pkt_hdr->ref_len);

>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>> +     } while (pkt_hdr);

>> +

>> +     return pkt_len;

>> +}

>> +

>> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     return odp_atomic_load_u32(&pkt_hdr->ref_count);

>> +}

>> +

>> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr, uint32_t n)

>> +{

>> +     odp_atomic_init_u32(&pkt_hdr->ref_count, n);

>>  }

>>

>>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>  {

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> +

>>       pkt_hdr->frame_len = len;

>> +     pkt_hdr->unshared_len = len;

>>  }

>>

>>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

>> diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c

>> index f632a51..170965a 100644

>> --- a/platform/linux-generic/odp_packet.c

>> +++ b/platform/linux-generic/odp_packet.c

>> @@ -33,13 +33,24 @@ static inline odp_buffer_t buffer_handle(odp_packet_hdr_t *pkt_hdr)

>>       return pkt_hdr->buf_hdr.handle.handle;

>>  }

>>

>> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

>> +}

>> +

>> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

>> +}

>> +

>>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

>>                                     uint32_t seg_idx)

>>  {

>>       return pkt_hdr->buf_hdr.seg[seg_idx].len;

>>  }

>>

>> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr, uint32_t seg_idx)

>> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

>> +                                    uint32_t seg_idx)

>>  {

>>       return pkt_hdr->buf_hdr.seg[seg_idx].data;

>>  }

>> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t *pkt_hdr)

>>               return pkt_hdr->buf_hdr.segcount - 1;

>>  }

>>

>> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     return pkt_hdr->buf_hdr.seg[0].data;

>> +}

>> +

>>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t *pkt_hdr)

>>  {

>>       return packet_seg_len(pkt_hdr, 0);

>> @@ -64,11 +80,6 @@ static inline uint32_t packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)

>>       return packet_seg_len(pkt_hdr, last);

>>  }

>>

>> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>> -{

>> -     return pkt_hdr->buf_hdr.seg[0].data;

>> -}

>> -

>>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

>>  {

>>       int last = packet_last_seg(pkt_hdr);

>> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>  {

>>       pkt_hdr->headroom  -= len;

>>       pkt_hdr->frame_len += len;

>> +     pkt_hdr->unshared_len += len;

>>       pkt_hdr->buf_hdr.seg[0].data -= len;

>>       pkt_hdr->buf_hdr.seg[0].len  += len;

>>  }

>> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>  {

>>       pkt_hdr->headroom  += len;

>>       pkt_hdr->frame_len -= len;

>> +     pkt_hdr->unshared_len -= len;

>>       pkt_hdr->buf_hdr.seg[0].data += len;

>>       pkt_hdr->buf_hdr.seg[0].len  -= len;

>>  }

>> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>

>>       pkt_hdr->tailroom  -= len;

>>       pkt_hdr->frame_len += len;

>> +     pkt_hdr->unshared_len += len;

>>       pkt_hdr->buf_hdr.seg[last].len += len;

>>  }

>>

>> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t *dst,

>>       dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

>>       dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

>>

>> +     /* reference related metadata */

>> +     dst->ref_len      = src->ref_len;

>> +     dst->unshared_len = src->unshared_len;

>> +

>>       /* segmentation data is not copied:

>>        *   buf_hdr.seg[]

>>        *   buf_hdr.segcount

>> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t *pkt_hdr,

>>       int seg = 0;

>>       int seg_count = pkt_hdr->buf_hdr.segcount;

>>

>> -     if (odp_unlikely(offset >= pkt_hdr->frame_len))

>> +     /* Special processing for references */

>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> +             seg_count = pkt_hdr->buf_hdr.segcount;

>> +     }

>> +

>> +     if (odp_unlikely(offset > pkt_hdr->frame_len))

>>               return NULL;

>>

>>       if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {

>> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)

>>       pkt_hdr->p.l2_offset        = 0;

>>       pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

>>       pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

>> +

>> +     /* Ensure dummy pkt_hdrs used in I/O recv classification are valid */

>> +     pkt_hdr->ref_hdr = NULL;

>>  }

>>

>>  /**

>> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t *pkt_hdr, uint32_t

>> len,

>>                            CONFIG_PACKET_TAILROOM;

>>

>>       pkt_hdr->input = ODP_PKTIO_INVALID;

>> +

>> +     /* By default packet has no references */

>> +     pkt_hdr->unshared_len = len;

>> +     pkt_hdr->ref_hdr = NULL;

>>  }

>>

>>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>>

>>       hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

>>       hdr->buf_hdr.seg[0].len  = BASE_LEN;

>> +     packet_ref_count_set(hdr, 1);

>>

>>       /* Link segments */

>>       if (CONFIG_PACKET_MAX_SEGS != 1) {

>> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>>                       for (i = 1; i < num; i++) {

>>                               odp_buffer_hdr_t *buf_hdr;

>>

>> +                             packet_ref_count_set(pkt_hdr[i], 1);

>>                               buf_hdr = &pkt_hdr[i]->buf_hdr;

>>                               hdr->buf_hdr.seg[i].hdr  = buf_hdr;

>>                               hdr->buf_hdr.seg[i].data = buf_hdr->base_data;

>> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>               new_hdr->buf_hdr.seg[0].len   = seg_len;

>>

>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>> -             new_hdr->frame_len = pkt_hdr->frame_len + len;

>> -             new_hdr->headroom  = pool->headroom + offset;

>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>> +             new_hdr->frame_len    = pkt_hdr->frame_len + len;

>> +             new_hdr->unshared_len = pkt_hdr->unshared_len + len;

>> +             new_hdr->headroom     = pool->headroom + offset;

>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>>

>>               pkt_hdr = new_hdr;

>>       } else {

>> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>               last = packet_last_seg(pkt_hdr);

>>               pkt_hdr->buf_hdr.seg[last].len = seg_len;

>>

>> -             pkt_hdr->frame_len += len;

>> -             pkt_hdr->tailroom   = pool->tailroom + offset;

>> +             pkt_hdr->frame_len    += len;

>> +             pkt_hdr->unshared_len += len;

>> +             pkt_hdr->tailroom      = pool->tailroom + offset;

>>       }

>>

>>       return pkt_hdr;

>> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>

>>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first, int num)

>>  {

>> -     int i;

>> +     int i, nfree;

>>       odp_buffer_t buf[num];

>>

>> -     for (i = 0; i < num; i++)

>> -             buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first + i].hdr);

>> +     for (i = 0, nfree = 0; i < num; i++) {

>> +             odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first + i].hdr;

>> +

>> +             if (packet_ref_dec(hdr) == 1)

>> +                     buf[nfree++] = buffer_handle(hdr);

>> +     }

>>

>> -     buffer_free_multi(buf, num);

>> +     if (nfree > 0)

>> +             buffer_free_multi(buf, nfree);

>>  }

>>

>>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

>> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>

>>       if (head) {

>>               odp_packet_hdr_t *new_hdr;

>> -             int i;

>> +             int i, nfree;

>>               odp_buffer_t buf[num];

>>

>> -             for (i = 0; i < num; i++)

>> -                     buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);

>> +             for (i = 0, nfree = 0; i < num; i++) {

>> +                     new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>> +

>> +                     if (packet_ref_dec(new_hdr) == 1)

>> +                             buf[nfree++] = buffer_handle(new_hdr);

>> +             }

>>

>>               /* First remaining segment is the new packet descriptor */

>>               new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

>> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>>

>>               /* Tailroom not changed */

>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>> -             new_hdr->headroom  = seg_headroom(new_hdr, 0);

>> -             new_hdr->frame_len = pkt_hdr->frame_len - free_len;

>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>> +             new_hdr->headroom     = seg_headroom(new_hdr, 0);

>> +             new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

>> +             new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;

>>

>>               pull_head(new_hdr, pull_len);

>>

>>               pkt_hdr = new_hdr;

>>

>> -             buffer_free_multi(buf, num);

>> +             if (nfree > 0)

>> +                     buffer_free_multi(buf, nfree);

>>       } else {

>>               /* Free last 'num' bufs */

>>               free_bufs(pkt_hdr, num_remain, num);

>> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>> *pkt_hdr,

>>                * of the metadata. */

>>               pkt_hdr->buf_hdr.segcount = num_remain;

>>               pkt_hdr->frame_len -= free_len;

>> +             pkt_hdr->unshared_len -= free_len;

>>               pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain - 1);

>>

>>               pull_tail(pkt_hdr, pull_len);

>> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t pool_hdl, uint32_t len,

>>       return num;

>>  }

>>

>> -void odp_packet_free(odp_packet_t pkt)

>> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> -     int num_seg = pkt_hdr->buf_hdr.segcount;

>> +     odp_packet_hdr_t *ref_hdr;

>> +     uint32_t ref_count;

>>

>> -     if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

>> -             buffer_free_multi((odp_buffer_t *)&pkt, 1);

>> -     else

>> -             free_bufs(pkt_hdr, 0, num_seg);

>> -}

>> +     do {

>> +             ref_hdr = pkt_hdr->ref_hdr;

>> +             ref_count = packet_ref_count(pkt_hdr) - 1;

>> +             free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>>

>> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>> -{

>> -     if (CONFIG_PACKET_MAX_SEGS == 1) {

>> -             buffer_free_multi((const odp_buffer_t * const)pkt, num);

>> -     } else {

>> -             odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

>> -             int i, j;

>> -             int bufs = 0;

>> +             if (ref_count == 1)

>> +                     pkt_hdr->unshared_len = pkt_hdr->frame_len;

>>

>> -             for (i = 0; i < num; i++) {

>> -                     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt[i]);

>> -                     int num_seg = pkt_hdr->buf_hdr.segcount;

>> -                     odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

>> -

>> -                     buf[bufs] = (odp_buffer_t)pkt[i];

>> -                     bufs++;

>> +             pkt_hdr = ref_hdr;

>> +     } while (pkt_hdr);

>> +}

>>

>> -                     if (odp_likely(num_seg == 1))

>> -                             continue;

>> +void odp_packet_free(odp_packet_t pkt)

>> +{

>> +     packet_free(odp_packet_hdr(pkt));

>> +}

>>

>> -                     for (j = 1; j < num_seg; j++) {

>> -                             buf[bufs] = buffer_handle(buf_hdr->seg[j].hdr);

>> -                             bufs++;

>> -                     }

>> -             }

>> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>> +{

>> +     int i;

>>

>> -             buffer_free_multi(buf, bufs);

>> -     }

>> +     for (i = 0; i < num; i++)

>> +             packet_free(odp_packet_hdr(pkt[i]));

>>  }

>>

>>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>>       if (len > pool->headroom + pool->data_size + pool->tailroom)

>>               return -1;

>>

>> +     if (pkt_hdr->ref_hdr)

>> +             packet_free(pkt_hdr->ref_hdr);

>> +

>>       packet_init(pkt_hdr, len, 0);

>>

>>       return 0;

>> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

>>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     uint32_t buf_len = 0;

>>

>> -     return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

>> +     do {

>> +             buf_len += pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>> +     } while (pkt_hdr);

>> +

>> +     return buf_len;

>>  }

>>

>>  void *odp_packet_data(odp_packet_t pkt)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>

>> -     return packet_data(pkt_hdr);

>> +     return packet_map(pkt_hdr, 0, NULL, NULL);

>>  }

>>

>>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

>> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

>>

>>  uint32_t odp_packet_len(odp_packet_t pkt)

>>  {

>> -     return odp_packet_hdr(pkt)->frame_len;

>> +     return packet_len(odp_packet_hdr(pkt));

>> +}

>> +

>> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

>> +{

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     uint32_t pkt_len = 0, offset = 0;

>> +

>> +     do {

>> +             if (packet_ref_count(pkt_hdr) > 1) {

>> +                     if (offset == 0)

>> +                             pkt_len += pkt_hdr->unshared_len;

>> +                     break;

>> +             }

>> +

>> +             pkt_len += pkt_hdr->frame_len - offset;

>> +             offset   = pkt_hdr->ref_offset;

>> +

>> +             if (pkt_hdr->ref_hdr)

>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>> +                                pkt_hdr->ref_len);

>> +

>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> +     } while (pkt_hdr);

>> +

>> +     return pkt_len;

>>  }

>>

>>  uint32_t odp_packet_headroom(odp_packet_t pkt)

>> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

>>

>>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

>>  {

>> -     return odp_packet_hdr(pkt)->tailroom;

>> +     return odp_packet_last_hdr(pkt, NULL)->tailroom;

>>  }

>>

>>  void *odp_packet_tail(odp_packet_t pkt)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>>

>>       return packet_tail(pkt_hdr);

>>  }

>> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>>       uint32_t frame_len = pkt_hdr->frame_len;

>> -     uint32_t headroom  = pkt_hdr->headroom;

>> +     uint32_t headroom = pkt_hdr->headroom;

>>       int ret = 0;

>>

>>       if (len > headroom) {

>> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>               segs = pkt_hdr->buf_hdr.segcount;

>>

>>               if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS)) {

>> +                     /* Handle recursively via references when

>> +                      * working with referenced packets since another

>> +                      * thread may be accessing it concurrently via

>> +                      * its reference to it. */

>> +                     if (packet_ref_count(pkt_hdr) > 1) {

>> +                             odp_packet_t ref;

>> +                             uint32_t unshared_len;

>> +

>> +                             push_head(pkt_hdr, headroom);

>> +                             unshared_len = pkt_hdr->unshared_len;

>> +                             ref = odp_packet_ref(*pkt, 0);

>> +

>> +                             if (ref == ODP_PACKET_INVALID) {

>> +                                     pull_head(pkt_hdr, headroom);

>> +                                     return -1;

>> +                             }

>> +

>> +                             ret = odp_packet_extend_head(&ref,

>> +                                                          len - headroom,

>> +                                                          data_ptr,

>> +                                                          seg_len);

>> +

>> +                             if (ret < 0) {

>> +                                     odp_packet_free(ref);

>> +                                     pull_head(pkt_hdr, headroom);

>> +                                     return -1;

>> +                             }

>> +

>> +                             /* Since this is a special ref, the

>> +                              * base pkt's unshared len is unchanged */

>> +                             pkt_hdr->unshared_len = unshared_len;

>> +

>> +                             /* Remove extra ref to the base pkt */

>> +                             odp_packet_free(*pkt);

>> +

>> +                             /* Return the ref as the extension result */

>> +                             *pkt = ref;

>> +                             return 1;

>> +                     }

>> +

>>                       /* Cannot directly add new segments */

>>                       odp_packet_hdr_t *new_hdr;

>>                       int new_segs = 0;

>> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>

>>                       pkt_hdr->buf_hdr.segcount = segs;

>>                       pkt_hdr->frame_len        = frame_len;

>> +                     pkt_hdr->unshared_len     = frame_len;

>>                       pkt_hdr->headroom         = offset + pool->headroom;

>>                       pkt_hdr->tailroom         = pool->tailroom;

>>

>> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>               push_head(pkt_hdr, len);

>>       }

>>

>> -     if (data_ptr)

>> -             *data_ptr = packet_data(pkt_hdr);

>> +     if (data_ptr || seg_len) {

>> +             uint32_t seg_ln = 0;

>> +             void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

>>

>> -     if (seg_len)

>> -             *seg_len = packet_first_seg_len(pkt_hdr);

>> +             if (data_ptr)

>> +                     *data_ptr = data;

>> +

>> +             if (seg_len)

>> +                     *seg_len = seg_ln;

>> +     }

>>

>>       return ret;

>>  }

>> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>>       if (len > pkt_hdr->frame_len)

>>               return NULL;

>>

>> +     ODP_ASSERT(len <= pkt_hdr->unshared_len);

>> +

>>       pull_head(pkt_hdr, len);

>>       return packet_data(pkt_hdr);

>>  }

>> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>>                         void **data_ptr, uint32_t *seg_len_out)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

>>       uint32_t seg_len = packet_first_seg_len(pkt_hdr);

>> +     int ret = 0;

>>

>> -     if (len > pkt_hdr->frame_len)

>> +     if (len > packet_len(pkt_hdr))

>>               return -1;

>>

>> -     if (len < seg_len) {

>> +     ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

>> +

>> +     /* Special processing for references */

>> +     while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> +             ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> +             nxt_hdr = pkt_hdr->ref_hdr;

>> +             len -= pkt_hdr->frame_len;

>> +             len += pkt_hdr->ref_offset +

>> +                     (nxt_hdr->frame_len - pkt_hdr->ref_len);

>> +             pkt_hdr->ref_hdr = NULL;

>> +             packet_free(pkt_hdr);

>> +             pkt_hdr = nxt_hdr;

>> +             seg_len = packet_first_seg_len(pkt_hdr);

>> +             *pkt = packet_handle(pkt_hdr);

>> +             ret = 1;

>> +     }

>> +

>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>> +         len < seg_len ||

>> +         pkt_hdr->buf_hdr.segcount == 1) {

>>               pull_head(pkt_hdr, len);

>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>> +     } else {

>>               int num = 0;

>>               uint32_t pull_len = 0;

>>

>> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>>               *pkt    = packet_handle(pkt_hdr);

>>       }

>>

>> -     if (data_ptr)

>> -             *data_ptr = packet_data(pkt_hdr);

>> +     if (data_ptr || seg_len_out) {

>> +             void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);

>>

>> -     if (seg_len_out)

>> -             *seg_len_out = packet_first_seg_len(pkt_hdr);

>> +             if (data_ptr)

>> +                     *data_ptr = data_head;

>>

>> -     return 0;

>> +             if (seg_len_out)

>> +                     *seg_len_out = seg_len;

>> +     }

>> +

>> +     return ret;

>>  }

>>

>>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>>       void *old_tail;

>>

>>       if (len > pkt_hdr->tailroom)

>>               return NULL;

>>

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> +

>>       old_tail = packet_tail(pkt_hdr);

>>       push_tail(pkt_hdr, len);

>>

>> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

>>                          void **data_ptr, uint32_t *seg_len_out)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

>>       uint32_t frame_len = pkt_hdr->frame_len;

>>       uint32_t tailroom  = pkt_hdr->tailroom;

>>       uint32_t tail_off  = frame_len;

>>       int ret = 0;

>>

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> +

>>       if (len > tailroom) {

>>               pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);

>>               int num;

>> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt, uint32_t len)

>>       if (len > packet_last_seg_len(pkt_hdr))

>>               return NULL;

>>

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>       pull_tail(pkt_hdr, len);

>>

>>       return packet_tail(pkt_hdr);

>> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt, uint32_t len,

>>  {

>>       int last;

>>       uint32_t seg_len;

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> +     uint32_t offset;

>> +     odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

>> +     odp_packet_hdr_t *pkt_hdr, *prev_hdr;

>>

>> -     if (len > pkt_hdr->frame_len)

>> +     if (len > packet_len(first_hdr))

>>               return -1;

>>

>> +     pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

>> +

>> +     /* Special processing for references */

>> +     while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr) {

>> +             len -= (pkt_hdr->frame_len - offset);

>> +             prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr, &offset);

>> +             ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

>> +             prev_hdr->ref_hdr = NULL;

>> +             packet_free(pkt_hdr);

>> +             pkt_hdr = prev_hdr;

>> +     }

>> +

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>       last    = packet_last_seg(pkt_hdr);

>>       seg_len = packet_seg_len(pkt_hdr, last);

>>

>> -     if (len < seg_len) {

>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>> +         len < seg_len ||

>> +         pkt_hdr->buf_hdr.segcount == 1) {

>>               pull_tail(pkt_hdr, len);

>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>> +     } else {

>>               int num = 0;

>>               uint32_t pull_len = 0;

>>

>> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt, odp_time_t timestamp)

>>

>>  int odp_packet_is_segmented(odp_packet_t pkt)

>>  {

>> -     return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +

>> +     return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;

>>  }

>>

>>  int odp_packet_num_segs(odp_packet_t pkt)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     uint32_t segcount = 0, i;

>> +     uint32_t seg_offset = 0, offset;

>> +

>> +     do {

>> +             segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

>> +             offset    = pkt_hdr->ref_offset;

>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> +             if (pkt_hdr) {

>> +                     for (i = 0, seg_offset = 0;

>> +                          i < pkt_hdr->buf_hdr.segcount;

>> +                          i++, seg_offset++) {

>> +                             if (offset < pkt_hdr->buf_hdr.seg[i].len)

>> +                                     break;

>> +                             offset -= pkt_hdr->buf_hdr.seg[i].len;

>> +                     }

>> +             }

>> +     } while (pkt_hdr);

>>

>> -     return pkt_hdr->buf_hdr.segcount;

>> +     return segcount;

>>  }

>>

>> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

>> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

>>  {

>> -     (void)pkt;

>> -

>>       return 0;

>>  }

>>

>>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

>>  {

>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> -

>> -     return packet_last_seg(pkt_hdr);

>> +     return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

>>  }

>>

>>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>

>> -     if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))

>> +     if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

>>               return ODP_PACKET_SEG_INVALID;

>>

>>       return seg + 1;

>> @@ -1400,21 +1581,51 @@ odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt,

>> odp_packet_seg_t seg)

>>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     uint32_t seg_offset = 0, offset = 0, i;

>> +

>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>> +            pkt_hdr->ref_hdr) {

>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>> +             offset  = pkt_hdr->ref_offset;

>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> +             for (i = 0, seg_offset = 0;

>> +                  i < pkt_hdr->buf_hdr.segcount;

>> +                  i++, seg_offset++) {

>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>> +                             break;

>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>> +             }

>> +     }

>>

>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>>               return NULL;

>>

>> -     return packet_seg_data(pkt_hdr, seg);

>> +     return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

>>  }

>>

>>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t seg)

>>  {

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +     uint32_t seg_offset = 0, offset = 0, i;

>> +

>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>> +            pkt_hdr->ref_hdr) {

>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>> +             offset  = pkt_hdr->ref_offset;

>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> +             for (i = 0, seg_offset = 0;

>> +                  i < pkt_hdr->buf_hdr.segcount;

>> +                  i++, seg_offset++) {

>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>> +                             break;

>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>> +             }

>> +     }

>>

>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>>               return 0;

>>

>> -     return packet_seg_len(pkt_hdr, seg);

>> +     return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

>>  }

>>

>>  /*

>> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t *pkt_ptr, uint32_t offset,

>> uint32_t len)

>>  {

>>       odp_packet_t pkt = *pkt_ptr;

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> -     uint32_t pktlen = pkt_hdr->frame_len;

>> +     uint32_t pktlen = packet_len(pkt_hdr);

>>       odp_packet_t newpkt;

>>

>>       if (offset > pktlen)

>>               return -1;

>>

>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

>> +

>>       newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen + len);

>>

>>       if (newpkt == ODP_PACKET_INVALID)

>> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt, uint32_t offset, uint32_t

>> len,

>>       if (align > ODP_CACHE_LINE_SIZE)

>>               return -1;

>>

>> +     ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

>> +

>>       if (seglen >= len) {

>>               misalign = align <= 1 ? 0 :

>>                       ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

>> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

>>       uint32_t dst_len    = dst_hdr->frame_len;

>>       uint32_t src_len    = src_hdr->frame_len;

>>

>> +     ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

>> +

>>       /* Do a copy if resulting packet would be out of segments or packets

>> -      * are from different pools. */

>> +      * are from different pools or src is a reference. */

>>       if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS) ||

>> -         odp_unlikely(dst_pool != src_pool)) {

>> +         odp_unlikely(dst_pool != src_pool) ||

>> +         odp_unlikely(packet_ref_count(src_hdr)) > 1) {

>>               if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >= 0) {

>>                       (void)odp_packet_copy_from_pkt(*dst, dst_len,

>>                                                      src, 0, src_len);

>> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

>>

>>       add_all_segs(dst_hdr, src_hdr);

>>

>> -     dst_hdr->frame_len = dst_len + src_len;

>> -     dst_hdr->tailroom  = src_hdr->tailroom;

>> +     dst_hdr->frame_len    = dst_len + src_len;

>> +     dst_hdr->unshared_len = dst_len + src_len;

>> +     dst_hdr->tailroom     = src_hdr->tailroom;

>>

>>       /* Data was not moved in memory */

>>       return 0;

>> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>> *tail)

>>       if (len >= pktlen || tail == NULL)

>>               return -1;

>>

>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

>>       *tail = odp_packet_copy_part(*pkt, len, pktlen - len,

>>                                    odp_packet_pool(*pkt));

>>

>> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>> *tail)

>>  }

>>

>>  /*

>> + * References

>> + */

>> +

>> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

>> +{

>> +     uint32_t i;

>> +     odp_packet_hdr_t *hdr;

>> +

>> +     do {

>> +             for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

>> +                     hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>> +                     packet_ref_inc(hdr);

>> +             }

>> +

>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> +     } while (pkt_hdr);

>> +}

>> +

>> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

>> +                                      uint32_t offset,

>> +                                      odp_packet_hdr_t *ref_hdr)

>> +{

>> +     /* Catch attempted references to stale handles in debug builds */

>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

>> +

>> +     /* Splicing is from the last section of src pkt */

>> +     while (ref_hdr->ref_hdr)

>> +             ref_hdr = ref_hdr->ref_hdr;

>> +

>> +     /* Find section where splice begins */

>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> +     }

>> +

>> +     ref_hdr->ref_hdr    = pkt_hdr;

>> +     ref_hdr->ref_offset = offset;

>> +     ref_hdr->ref_len    = pkt_hdr->frame_len;

>> +

>> +     if (offset < pkt_hdr->unshared_len)

>> +             pkt_hdr->unshared_len = offset;

>> +

>> +     packet_ref(pkt_hdr);

>> +     return _odp_packet_hdl(ref_hdr);

>> +}

>> +

>> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

>> +{

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +

>> +     pkt_hdr->unshared_len = 0;

>> +     packet_ref(pkt_hdr);

>> +     return pkt;

>> +}

>> +

>> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

>> +{

>> +     odp_packet_t hdr;

>> +     odp_packet_hdr_t *pkt_hdr;

>> +

>> +     if (pkt == ODP_PACKET_INVALID)

>> +             return ODP_PACKET_INVALID;

>> +

>> +     pkt_hdr = odp_packet_hdr(pkt);

>> +     if (offset >= packet_len(pkt_hdr))

>> +             return ODP_PACKET_INVALID;

>> +

>> +     hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

>> +

>> +     if (hdr == ODP_PACKET_INVALID)

>> +             return ODP_PACKET_INVALID;

>> +

>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>> +}

>> +

>> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

>> +                             odp_packet_t hdr)

>> +{

>> +     odp_packet_hdr_t *pkt_hdr;

>> +

>> +     if (pkt == ODP_PACKET_INVALID ||

>> +         hdr == ODP_PACKET_INVALID ||

>> +         pkt == hdr)

>> +             return ODP_PACKET_INVALID;

>> +

>> +     ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

>> +

>> +     pkt_hdr = odp_packet_hdr(pkt);

>> +     if (offset >= packet_len(pkt_hdr))

>> +             return ODP_PACKET_INVALID;

>> +

>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>> +}

>> +

>> +int odp_packet_has_ref(odp_packet_t pkt)

>> +{

>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> +

>> +     return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;

>> +}

>> +

>> +/*

>>   *

>>   * Copy

>>   * ********************************************************

>> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>> *tail)

>>

>>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

>>  {

>> -     odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

>> -     uint32_t pktlen = srchdr->frame_len;

>> +     uint32_t pktlen = odp_packet_len(pkt);

>>       odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

>>

>>       if (newpkt != ODP_PACKET_INVALID) {

>> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt, uint32_t offset,

>>       uint8_t *dstaddr = (uint8_t *)dst;

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>

>> -     if (offset + len > pkt_hdr->frame_len)

>> +     if (offset + len > packet_len(pkt_hdr))

>>               return -1;

>>

>>       while (len > 0) {

>> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t pkt, uint32_t offset,

>>       const uint8_t *srcaddr = (const uint8_t *)src;

>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>

>> -     if (offset + len > pkt_hdr->frame_len)

>> +     if (offset + len > packet_len(pkt_hdr))

>>               return -1;

>>

>> +     ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

>> +

>>       while (len > 0) {

>>               mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

>>               cpylen = len > seglen ? seglen : len;

>> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t dst, uint32_t

>> dst_offset,

>>       uint32_t src_seglen = 0; /* GCC */

>>       int overlap;

>>

>> -     if (dst_offset + len > dst_hdr->frame_len ||

>> -         src_offset + len > src_hdr->frame_len)

>> +     if (dst_offset + len > packet_len(dst_hdr) ||

>> +         src_offset + len > packet_len(src_hdr))

>>               return -1;

>>

>> +     ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

>> +

>>       overlap = (dst_hdr == src_hdr &&

>>                  ((dst_offset <= src_offset &&

>>                    dst_offset + len >= src_offset) ||

>> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

>>       len += snprintf(&str[len], n - len,

>>                       "  l4_offset    %" PRIu32 "\n", hdr->p.l4_offset);

>>       len += snprintf(&str[len], n - len,

>> -                     "  frame_len    %" PRIu32 "\n", hdr->frame_len);

>> +                     "  frame_len    %" PRIu32 "\n", packet_len(hdr));

>>       len += snprintf(&str[len], n - len,

>>                       "  input        %" PRIu64 "\n",

>>                       odp_pktio_to_u64(hdr->input));

>> --

>> 2.9.3

>
Bill Fischofer Feb. 17, 2017, 9:08 p.m. UTC | #3
I've posted patch http://patches.opendataplane.org/patch/8155/ to
address this issue.  It goes on api-next on top of patches
http://patches.opendataplane.org/patch/7879/ and
http://patches.opendataplane.org/patch/8154/

On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer
<bill.fischofer@linaro.org> wrote:
> First off, thank you very much for this review.

>

> Please note that this code has been streamlined in patch

> http://patches.opendataplane.org/patch/7879/ and has been further

> refined with patch http://patches.opendataplane.org/patch/8145/ but

> the exposure you identify still exists in that code.

>

> On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

> <janne.peltonen@nokia.com> wrote:

>> Hi,

>>

>> I took a look at the packet references and it seems to me that

>> either the implementation is a bit racy or I confused myself

>> when reading the code. Or maybe I got the intended concurrency

>> semantics of the packet references wrong?

>>

>> My first issue is that packet_free() may access freed packet

>> header or corrupt unshared_len.

>>

>> The packet free function looks like this:

>>

>> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> {

>>         odp_packet_hdr_t *ref_hdr;

>>         uint32_t ref_count;

>>

>>         do {

>>                 ref_hdr = pkt_hdr->ref_hdr;

>>                 ref_count = packet_ref_count(pkt_hdr) - 1;

>>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>>

>>                 if (ref_count == 1)

>>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

>>

>>                 pkt_hdr = ref_hdr;

>>         } while (pkt_hdr);

>> }

>>

>> The problem here is that decrementing the ref_count, checking

>> its value and updating unshared_len is not a single atomic

>> operation. By the time packet_free() checks if ref_count == 1

>> (i.e. if there is exactly one other reference left somewhere),

>> the true ref_count may have already been changed by another

>> thread doing packet_free() or packet_ref().

>>

>> For example, if two threads have a reference to the same packet

>> then execution (or the relevant memory ops) may get "interleaved"

>> as follows:

>>

>> T1: call packet_free()

>> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

>> At this point ref_count variable is 1

>> T1: call free_bufs()

>> T1: call packet_ref_dec()

>> Now the ref_count of the packet header is 1.

>> T2: call and complete packet_free()

>> Thread 2 sees refcount 1 in the packet and frees the buffers

>> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> Thread 1 accesses freed buffer for reading and writing.

>

> I agree. These steps should be reversed so that the code reads:

>

> if (ref_count == 1)

>    pkt_hdr->unshared_len = pkt_hdr->frame_len;

>

> free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>

> Or using the code with the above two patches applied, the code should read:

>

> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> {

>         odp_packet_hdr_t *ref_hdr;

>         uint32_t ref_count;

>         int num_seg;

>

>         do {

>                 ref_count = packet_ref_count(pkt_hdr);

>                 num_seg = pkt_hdr->buf_hdr.segcount;

>                 ref_hdr = pkt_hdr->ref_hdr;

>

>                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&

>                     ref_count == 1)) {

>                         buffer_free_multi((odp_buffer_t

> *)&pkt_hdr->buf_hdr.handle.handle, 1);

>                 } else {

>                         if (ref_count == 2)

>                                 pkt_hdr->unshared_len = pkt_hdr->frame_len;

>

>                         free_bufs(pkt_hdr, 0, num_seg);

>                  }

>

>                  pkt_hdr = ref_hdr;

>         } while (pkt_hdr);

> }

>

> The mistake was trying to optimize things so that unshared_len is not

> set if the buffers are being freed, but that exposes these race

> conditions. So the worst that should now happen is that it is set

> unnecessarily before being freed.

>

> If you concur I'll fold this fix into a v3 for patch

> http://patches.opendataplane.org/patch/8145/

>

>>

>> Similarly, if T2 created a new reference, T1 would have

>> a wrong idea of the number of remaining references and

>> would adjust the unshared_len to an incorrect value.

>>

>> Right?

>>

>> Maybe other modifications of unshared_len are also racy.

>

> I don't believe so, because references do not change the existing ODP

> restriction that two threads cannot share the same odp_packet_t.  When

> a packet reference is created it returns a separate odp_packet_t that

> has its own metadata. So unshared_len is always private to an

> individual odp_packet_t. The exception is static references, but in

> this case the entire

> packet along with its metadata must be treated as read-only, so

> operations like odp_packet_push_head() that would try to modify

> unshared_len are prohibited.

>

>>

>>

>>

>> The second issue is that the atomic ops for setting and

>> reading the ref count seem to have too relaxed memory

>> ordering. In particular, packet_ref_dec() must not happen

>> (be visible to other threads) before its caller is done

>> with the packet and the related memory accesses have

>> completed. Now there does not seem to be any optimization

>> or memory barrier to prevent the ref count decrement from

>> happening too early. So I think it is at least theoretically

>> possible that a thread e.g. reads from a packet buffer

>> after it has already been freed by another thread, somehow

>> like this:

>>

>> Source code order:

>> T1: interesting_data = read_from_pkt(pkt)

>> T1: packet_free(pkt)

>>

>> Order visible to T2:

>> 1: ref count decr

>> 2: read from pkt

>>

>> Now if T2 goes and frees the remaining reference between

>> steps 1 and 2, T1 may get even more interesting data.

>>

>> Right?

>

> I don't believe so. The semantics of odp_atomic_fetch_dec_u32(), which

> is what packet_ref_dec() uses, say that no two calls can see the same

> fetched value, so only one thread will return ref_count == 1 and free

> the buffer. Note that if I see ref_count == 1 no other thread can be

> trying to increment it via a concurrent odp_packet_ref() call because

> that would mean that two threads were trying to manipulate the same

> odp_packet_t, which is prohibited.

>

> For CPUs that support out-of-order instruction execution, this is only

> permitted provided the reordering and speculative executions are

> semantically consistent with sequential execution. If this were not

> the case you'd constantly have to worry about a processor turning

>

> T1: interesting_data = read_from_pkt(pkt)

> T1: packet_free(pkt)

>

> into

>

> T1: packet_free(pkt)

> T1: interesting_data = read_from_pkt(pkt)

>

> In your scenario above: T2 cannot be issuing a read to pkt after

> ref_count is decremented because the only way that T2 could be

> decrementing ref_count would be if T2 issued an odp_packet_free() call

> for it. Obviously if it tries to reference pkt after such a call that

> is an application error.

>

> Thanks again for your much-appreciated help in looking at this!

>

>>

>>         Janne

>>

>>

>>> -----Original Message-----

>>> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of Bill Fischofer

>>> Sent: Wednesday, January 11, 2017 4:34 AM

>>> To: lng-odp@lists.linaro.org

>>> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet: implement reference apis

>>>

>>> Implement the APIs:

>>> - odp_packet_ref_static()

>>> - odp_packet_ref()

>>> - odp_packet_ref_pkt()

>>> - odp_packet_has_ref()

>>> - odp_packet_unshared_len()

>>>

>>> This also involves functional upgrades to the existing packet manipulation

>>> APIs to work with packet references as input arguments.

>>>

>>> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

>>> ---

>>>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

>>>  platform/linux-generic/odp_packet.c                | 536 +++++++++++++++++----

>>>  2 files changed, 516 insertions(+), 107 deletions(-)

>>>

>>> diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-

>>> generic/include/odp_packet_internal.h

>>> index e6e9d74..607560d 100644

>>> --- a/platform/linux-generic/include/odp_packet_internal.h

>>> +++ b/platform/linux-generic/include/odp_packet_internal.h

>>> @@ -19,6 +19,7 @@ extern "C" {

>>>

>>>  #include <odp/api/align.h>

>>>  #include <odp/api/debug.h>

>>> +#include <odp_debug_internal.h>

>>>  #include <odp_buffer_internal.h>

>>>  #include <odp_pool_internal.h>

>>>  #include <odp_buffer_inlines.h>

>>> @@ -168,7 +169,7 @@ typedef struct {

>>>   * packet_init(). Because of this any new fields added must be reviewed for

>>>   * initialization requirements.

>>>   */

>>> -typedef struct {

>>> +typedef struct odp_packet_hdr_t {

>>>       /* common buffer header */

>>>       odp_buffer_hdr_t buf_hdr;

>>>

>>> @@ -184,6 +185,13 @@ typedef struct {

>>>       uint32_t headroom;

>>>       uint32_t tailroom;

>>>

>>> +     /* Fields used to support packet references */

>>> +     uint32_t unshared_len;

>>> +     struct odp_packet_hdr_t *ref_hdr;

>>> +     uint32_t ref_offset;

>>> +     uint32_t ref_len;

>>> +     odp_atomic_u32_t ref_count;

>>> +

>>>       /*

>>>        * Members below are not initialized by packet_init()

>>>        */

>>> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t *odp_packet_hdr(odp_packet_t pkt)

>>>       return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

>>>  }

>>>

>>> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,

>>> +                                                 uint32_t *offset)

>>> +{

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>>> +     uint32_t ref_offset = 0;

>>> +

>>> +     while (pkt_hdr->ref_hdr) {

>>> +             ref_offset = pkt_hdr->ref_offset;

>>> +             prev_hdr   = pkt_hdr;

>>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>>> +     }

>>> +

>>> +     if (offset) {

>>> +             if (prev_hdr != pkt_hdr)

>>> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

>>> +             *offset = ref_offset;

>>> +     }

>>> +

>>> +     return pkt_hdr;

>>> +}

>>> +

>>> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t *pkt_hdr,

>>> +                                                 odp_packet_hdr_t *cur_hdr,

>>> +                                                 uint32_t *offset)

>>> +{

>>> +     uint32_t ref_offset = 0;

>>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>>> +

>>> +     while (pkt_hdr->ref_hdr != cur_hdr) {

>>> +             ref_offset = pkt_hdr->ref_offset;

>>> +             prev_hdr   = pkt_hdr;

>>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>>> +     }

>>> +

>>> +     if (offset) {

>>> +             if (prev_hdr != pkt_hdr)

>>> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

>>> +             *offset = ref_offset;

>>> +     }

>>> +

>>> +     return pkt_hdr;

>>> +}

>>> +

>>> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

>>> +}

>>> +

>>>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t *src_hdr,

>>>                                              odp_packet_hdr_t *dst_hdr)

>>>  {

>>> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t *pkt_hdr, uint32_t

>>> len)

>>>

>>>       pkt_hdr->tailroom  += len;

>>>       pkt_hdr->frame_len -= len;

>>> +     pkt_hdr->unshared_len -= len;

>>>       pkt_hdr->buf_hdr.seg[last].len -= len;

>>>  }

>>>

>>>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

>>>  {

>>> -     return pkt_hdr->frame_len;

>>> +     uint32_t pkt_len = 0;

>>> +     uint32_t offset  = 0;

>>> +

>>> +     do {

>>> +             pkt_len += pkt_hdr->frame_len - offset;

>>> +             offset   = pkt_hdr->ref_offset;

>>> +             if (pkt_hdr->ref_hdr)

>>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>>> +                                pkt_hdr->ref_len);

>>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>>> +     } while (pkt_hdr);

>>> +

>>> +     return pkt_len;

>>> +}

>>> +

>>> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     return odp_atomic_load_u32(&pkt_hdr->ref_count);

>>> +}

>>> +

>>> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr, uint32_t n)

>>> +{

>>> +     odp_atomic_init_u32(&pkt_hdr->ref_count, n);

>>>  }

>>>

>>>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>>  {

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>> +

>>>       pkt_hdr->frame_len = len;

>>> +     pkt_hdr->unshared_len = len;

>>>  }

>>>

>>>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

>>> diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c

>>> index f632a51..170965a 100644

>>> --- a/platform/linux-generic/odp_packet.c

>>> +++ b/platform/linux-generic/odp_packet.c

>>> @@ -33,13 +33,24 @@ static inline odp_buffer_t buffer_handle(odp_packet_hdr_t *pkt_hdr)

>>>       return pkt_hdr->buf_hdr.handle.handle;

>>>  }

>>>

>>> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

>>> +}

>>> +

>>> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

>>> +}

>>> +

>>>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

>>>                                     uint32_t seg_idx)

>>>  {

>>>       return pkt_hdr->buf_hdr.seg[seg_idx].len;

>>>  }

>>>

>>> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr, uint32_t seg_idx)

>>> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

>>> +                                    uint32_t seg_idx)

>>>  {

>>>       return pkt_hdr->buf_hdr.seg[seg_idx].data;

>>>  }

>>> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t *pkt_hdr)

>>>               return pkt_hdr->buf_hdr.segcount - 1;

>>>  }

>>>

>>> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     return pkt_hdr->buf_hdr.seg[0].data;

>>> +}

>>> +

>>>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t *pkt_hdr)

>>>  {

>>>       return packet_seg_len(pkt_hdr, 0);

>>> @@ -64,11 +80,6 @@ static inline uint32_t packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)

>>>       return packet_seg_len(pkt_hdr, last);

>>>  }

>>>

>>> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>>> -{

>>> -     return pkt_hdr->buf_hdr.seg[0].data;

>>> -}

>>> -

>>>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

>>>  {

>>>       int last = packet_last_seg(pkt_hdr);

>>> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>>  {

>>>       pkt_hdr->headroom  -= len;

>>>       pkt_hdr->frame_len += len;

>>> +     pkt_hdr->unshared_len += len;

>>>       pkt_hdr->buf_hdr.seg[0].data -= len;

>>>       pkt_hdr->buf_hdr.seg[0].len  += len;

>>>  }

>>> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>>  {

>>>       pkt_hdr->headroom  += len;

>>>       pkt_hdr->frame_len -= len;

>>> +     pkt_hdr->unshared_len -= len;

>>>       pkt_hdr->buf_hdr.seg[0].data += len;

>>>       pkt_hdr->buf_hdr.seg[0].len  -= len;

>>>  }

>>> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)

>>>

>>>       pkt_hdr->tailroom  -= len;

>>>       pkt_hdr->frame_len += len;

>>> +     pkt_hdr->unshared_len += len;

>>>       pkt_hdr->buf_hdr.seg[last].len += len;

>>>  }

>>>

>>> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t *dst,

>>>       dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

>>>       dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

>>>

>>> +     /* reference related metadata */

>>> +     dst->ref_len      = src->ref_len;

>>> +     dst->unshared_len = src->unshared_len;

>>> +

>>>       /* segmentation data is not copied:

>>>        *   buf_hdr.seg[]

>>>        *   buf_hdr.segcount

>>> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t *pkt_hdr,

>>>       int seg = 0;

>>>       int seg_count = pkt_hdr->buf_hdr.segcount;

>>>

>>> -     if (odp_unlikely(offset >= pkt_hdr->frame_len))

>>> +     /* Special processing for references */

>>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>>> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

>>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>>> +             seg_count = pkt_hdr->buf_hdr.segcount;

>>> +     }

>>> +

>>> +     if (odp_unlikely(offset > pkt_hdr->frame_len))

>>>               return NULL;

>>>

>>>       if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {

>>> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)

>>>       pkt_hdr->p.l2_offset        = 0;

>>>       pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

>>>       pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

>>> +

>>> +     /* Ensure dummy pkt_hdrs used in I/O recv classification are valid */

>>> +     pkt_hdr->ref_hdr = NULL;

>>>  }

>>>

>>>  /**

>>> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t *pkt_hdr, uint32_t

>>> len,

>>>                            CONFIG_PACKET_TAILROOM;

>>>

>>>       pkt_hdr->input = ODP_PKTIO_INVALID;

>>> +

>>> +     /* By default packet has no references */

>>> +     pkt_hdr->unshared_len = len;

>>> +     pkt_hdr->ref_hdr = NULL;

>>>  }

>>>

>>>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>>> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>>>

>>>       hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

>>>       hdr->buf_hdr.seg[0].len  = BASE_LEN;

>>> +     packet_ref_count_set(hdr, 1);

>>>

>>>       /* Link segments */

>>>       if (CONFIG_PACKET_MAX_SEGS != 1) {

>>> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

>>>                       for (i = 1; i < num; i++) {

>>>                               odp_buffer_hdr_t *buf_hdr;

>>>

>>> +                             packet_ref_count_set(pkt_hdr[i], 1);

>>>                               buf_hdr = &pkt_hdr[i]->buf_hdr;

>>>                               hdr->buf_hdr.seg[i].hdr  = buf_hdr;

>>>                               hdr->buf_hdr.seg[i].data = buf_hdr->base_data;

>>> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>               new_hdr->buf_hdr.seg[0].len   = seg_len;

>>>

>>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>>> -             new_hdr->frame_len = pkt_hdr->frame_len + len;

>>> -             new_hdr->headroom  = pool->headroom + offset;

>>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>>> +             new_hdr->frame_len    = pkt_hdr->frame_len + len;

>>> +             new_hdr->unshared_len = pkt_hdr->unshared_len + len;

>>> +             new_hdr->headroom     = pool->headroom + offset;

>>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>>>

>>>               pkt_hdr = new_hdr;

>>>       } else {

>>> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>               last = packet_last_seg(pkt_hdr);

>>>               pkt_hdr->buf_hdr.seg[last].len = seg_len;

>>>

>>> -             pkt_hdr->frame_len += len;

>>> -             pkt_hdr->tailroom   = pool->tailroom + offset;

>>> +             pkt_hdr->frame_len    += len;

>>> +             pkt_hdr->unshared_len += len;

>>> +             pkt_hdr->tailroom      = pool->tailroom + offset;

>>>       }

>>>

>>>       return pkt_hdr;

>>> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>

>>>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first, int num)

>>>  {

>>> -     int i;

>>> +     int i, nfree;

>>>       odp_buffer_t buf[num];

>>>

>>> -     for (i = 0; i < num; i++)

>>> -             buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first + i].hdr);

>>> +     for (i = 0, nfree = 0; i < num; i++) {

>>> +             odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first + i].hdr;

>>> +

>>> +             if (packet_ref_dec(hdr) == 1)

>>> +                     buf[nfree++] = buffer_handle(hdr);

>>> +     }

>>>

>>> -     buffer_free_multi(buf, num);

>>> +     if (nfree > 0)

>>> +             buffer_free_multi(buf, nfree);

>>>  }

>>>

>>>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

>>> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>

>>>       if (head) {

>>>               odp_packet_hdr_t *new_hdr;

>>> -             int i;

>>> +             int i, nfree;

>>>               odp_buffer_t buf[num];

>>>

>>> -             for (i = 0; i < num; i++)

>>> -                     buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);

>>> +             for (i = 0, nfree = 0; i < num; i++) {

>>> +                     new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>>> +

>>> +                     if (packet_ref_dec(new_hdr) == 1)

>>> +                             buf[nfree++] = buffer_handle(new_hdr);

>>> +             }

>>>

>>>               /* First remaining segment is the new packet descriptor */

>>>               new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

>>> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>>>

>>>               /* Tailroom not changed */

>>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>>> -             new_hdr->headroom  = seg_headroom(new_hdr, 0);

>>> -             new_hdr->frame_len = pkt_hdr->frame_len - free_len;

>>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>>> +             new_hdr->headroom     = seg_headroom(new_hdr, 0);

>>> +             new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

>>> +             new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;

>>>

>>>               pull_head(new_hdr, pull_len);

>>>

>>>               pkt_hdr = new_hdr;

>>>

>>> -             buffer_free_multi(buf, num);

>>> +             if (nfree > 0)

>>> +                     buffer_free_multi(buf, nfree);

>>>       } else {

>>>               /* Free last 'num' bufs */

>>>               free_bufs(pkt_hdr, num_remain, num);

>>> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>>> *pkt_hdr,

>>>                * of the metadata. */

>>>               pkt_hdr->buf_hdr.segcount = num_remain;

>>>               pkt_hdr->frame_len -= free_len;

>>> +             pkt_hdr->unshared_len -= free_len;

>>>               pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain - 1);

>>>

>>>               pull_tail(pkt_hdr, pull_len);

>>> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t pool_hdl, uint32_t len,

>>>       return num;

>>>  }

>>>

>>> -void odp_packet_free(odp_packet_t pkt)

>>> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> -     int num_seg = pkt_hdr->buf_hdr.segcount;

>>> +     odp_packet_hdr_t *ref_hdr;

>>> +     uint32_t ref_count;

>>>

>>> -     if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

>>> -             buffer_free_multi((odp_buffer_t *)&pkt, 1);

>>> -     else

>>> -             free_bufs(pkt_hdr, 0, num_seg);

>>> -}

>>> +     do {

>>> +             ref_hdr = pkt_hdr->ref_hdr;

>>> +             ref_count = packet_ref_count(pkt_hdr) - 1;

>>> +             free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>>>

>>> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>>> -{

>>> -     if (CONFIG_PACKET_MAX_SEGS == 1) {

>>> -             buffer_free_multi((const odp_buffer_t * const)pkt, num);

>>> -     } else {

>>> -             odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

>>> -             int i, j;

>>> -             int bufs = 0;

>>> +             if (ref_count == 1)

>>> +                     pkt_hdr->unshared_len = pkt_hdr->frame_len;

>>>

>>> -             for (i = 0; i < num; i++) {

>>> -                     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt[i]);

>>> -                     int num_seg = pkt_hdr->buf_hdr.segcount;

>>> -                     odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

>>> -

>>> -                     buf[bufs] = (odp_buffer_t)pkt[i];

>>> -                     bufs++;

>>> +             pkt_hdr = ref_hdr;

>>> +     } while (pkt_hdr);

>>> +}

>>>

>>> -                     if (odp_likely(num_seg == 1))

>>> -                             continue;

>>> +void odp_packet_free(odp_packet_t pkt)

>>> +{

>>> +     packet_free(odp_packet_hdr(pkt));

>>> +}

>>>

>>> -                     for (j = 1; j < num_seg; j++) {

>>> -                             buf[bufs] = buffer_handle(buf_hdr->seg[j].hdr);

>>> -                             bufs++;

>>> -                     }

>>> -             }

>>> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>>> +{

>>> +     int i;

>>>

>>> -             buffer_free_multi(buf, bufs);

>>> -     }

>>> +     for (i = 0; i < num; i++)

>>> +             packet_free(odp_packet_hdr(pkt[i]));

>>>  }

>>>

>>>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>>> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>>>       if (len > pool->headroom + pool->data_size + pool->tailroom)

>>>               return -1;

>>>

>>> +     if (pkt_hdr->ref_hdr)

>>> +             packet_free(pkt_hdr->ref_hdr);

>>> +

>>>       packet_init(pkt_hdr, len, 0);

>>>

>>>       return 0;

>>> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

>>>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     uint32_t buf_len = 0;

>>>

>>> -     return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

>>> +     do {

>>> +             buf_len += pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

>>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>>> +     } while (pkt_hdr);

>>> +

>>> +     return buf_len;

>>>  }

>>>

>>>  void *odp_packet_data(odp_packet_t pkt)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>>

>>> -     return packet_data(pkt_hdr);

>>> +     return packet_map(pkt_hdr, 0, NULL, NULL);

>>>  }

>>>

>>>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

>>> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

>>>

>>>  uint32_t odp_packet_len(odp_packet_t pkt)

>>>  {

>>> -     return odp_packet_hdr(pkt)->frame_len;

>>> +     return packet_len(odp_packet_hdr(pkt));

>>> +}

>>> +

>>> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

>>> +{

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     uint32_t pkt_len = 0, offset = 0;

>>> +

>>> +     do {

>>> +             if (packet_ref_count(pkt_hdr) > 1) {

>>> +                     if (offset == 0)

>>> +                             pkt_len += pkt_hdr->unshared_len;

>>> +                     break;

>>> +             }

>>> +

>>> +             pkt_len += pkt_hdr->frame_len - offset;

>>> +             offset   = pkt_hdr->ref_offset;

>>> +

>>> +             if (pkt_hdr->ref_hdr)

>>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>>> +                                pkt_hdr->ref_len);

>>> +

>>> +             pkt_hdr = pkt_hdr->ref_hdr;

>>> +     } while (pkt_hdr);

>>> +

>>> +     return pkt_len;

>>>  }

>>>

>>>  uint32_t odp_packet_headroom(odp_packet_t pkt)

>>> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

>>>

>>>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

>>>  {

>>> -     return odp_packet_hdr(pkt)->tailroom;

>>> +     return odp_packet_last_hdr(pkt, NULL)->tailroom;

>>>  }

>>>

>>>  void *odp_packet_tail(odp_packet_t pkt)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>>>

>>>       return packet_tail(pkt_hdr);

>>>  }

>>> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>>>       uint32_t frame_len = pkt_hdr->frame_len;

>>> -     uint32_t headroom  = pkt_hdr->headroom;

>>> +     uint32_t headroom = pkt_hdr->headroom;

>>>       int ret = 0;

>>>

>>>       if (len > headroom) {

>>> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>>               segs = pkt_hdr->buf_hdr.segcount;

>>>

>>>               if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS)) {

>>> +                     /* Handle recursively via references when

>>> +                      * working with referenced packets since another

>>> +                      * thread may be accessing it concurrently via

>>> +                      * its reference to it. */

>>> +                     if (packet_ref_count(pkt_hdr) > 1) {

>>> +                             odp_packet_t ref;

>>> +                             uint32_t unshared_len;

>>> +

>>> +                             push_head(pkt_hdr, headroom);

>>> +                             unshared_len = pkt_hdr->unshared_len;

>>> +                             ref = odp_packet_ref(*pkt, 0);

>>> +

>>> +                             if (ref == ODP_PACKET_INVALID) {

>>> +                                     pull_head(pkt_hdr, headroom);

>>> +                                     return -1;

>>> +                             }

>>> +

>>> +                             ret = odp_packet_extend_head(&ref,

>>> +                                                          len - headroom,

>>> +                                                          data_ptr,

>>> +                                                          seg_len);

>>> +

>>> +                             if (ret < 0) {

>>> +                                     odp_packet_free(ref);

>>> +                                     pull_head(pkt_hdr, headroom);

>>> +                                     return -1;

>>> +                             }

>>> +

>>> +                             /* Since this is a special ref, the

>>> +                              * base pkt's unshared len is unchanged */

>>> +                             pkt_hdr->unshared_len = unshared_len;

>>> +

>>> +                             /* Remove extra ref to the base pkt */

>>> +                             odp_packet_free(*pkt);

>>> +

>>> +                             /* Return the ref as the extension result */

>>> +                             *pkt = ref;

>>> +                             return 1;

>>> +                     }

>>> +

>>>                       /* Cannot directly add new segments */

>>>                       odp_packet_hdr_t *new_hdr;

>>>                       int new_segs = 0;

>>> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>>

>>>                       pkt_hdr->buf_hdr.segcount = segs;

>>>                       pkt_hdr->frame_len        = frame_len;

>>> +                     pkt_hdr->unshared_len     = frame_len;

>>>                       pkt_hdr->headroom         = offset + pool->headroom;

>>>                       pkt_hdr->tailroom         = pool->tailroom;

>>>

>>> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

>>>               push_head(pkt_hdr, len);

>>>       }

>>>

>>> -     if (data_ptr)

>>> -             *data_ptr = packet_data(pkt_hdr);

>>> +     if (data_ptr || seg_len) {

>>> +             uint32_t seg_ln = 0;

>>> +             void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

>>>

>>> -     if (seg_len)

>>> -             *seg_len = packet_first_seg_len(pkt_hdr);

>>> +             if (data_ptr)

>>> +                     *data_ptr = data;

>>> +

>>> +             if (seg_len)

>>> +                     *seg_len = seg_ln;

>>> +     }

>>>

>>>       return ret;

>>>  }

>>> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>>>       if (len > pkt_hdr->frame_len)

>>>               return NULL;

>>>

>>> +     ODP_ASSERT(len <= pkt_hdr->unshared_len);

>>> +

>>>       pull_head(pkt_hdr, len);

>>>       return packet_data(pkt_hdr);

>>>  }

>>> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

>>>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>>>                         void **data_ptr, uint32_t *seg_len_out)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

>>>       uint32_t seg_len = packet_first_seg_len(pkt_hdr);

>>> +     int ret = 0;

>>>

>>> -     if (len > pkt_hdr->frame_len)

>>> +     if (len > packet_len(pkt_hdr))

>>>               return -1;

>>>

>>> -     if (len < seg_len) {

>>> +     ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

>>> +

>>> +     /* Special processing for references */

>>> +     while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>>> +             ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>> +             nxt_hdr = pkt_hdr->ref_hdr;

>>> +             len -= pkt_hdr->frame_len;

>>> +             len += pkt_hdr->ref_offset +

>>> +                     (nxt_hdr->frame_len - pkt_hdr->ref_len);

>>> +             pkt_hdr->ref_hdr = NULL;

>>> +             packet_free(pkt_hdr);

>>> +             pkt_hdr = nxt_hdr;

>>> +             seg_len = packet_first_seg_len(pkt_hdr);

>>> +             *pkt = packet_handle(pkt_hdr);

>>> +             ret = 1;

>>> +     }

>>> +

>>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>>> +         len < seg_len ||

>>> +         pkt_hdr->buf_hdr.segcount == 1) {

>>>               pull_head(pkt_hdr, len);

>>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>>> +     } else {

>>>               int num = 0;

>>>               uint32_t pull_len = 0;

>>>

>>> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>>>               *pkt    = packet_handle(pkt_hdr);

>>>       }

>>>

>>> -     if (data_ptr)

>>> -             *data_ptr = packet_data(pkt_hdr);

>>> +     if (data_ptr || seg_len_out) {

>>> +             void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);

>>>

>>> -     if (seg_len_out)

>>> -             *seg_len_out = packet_first_seg_len(pkt_hdr);

>>> +             if (data_ptr)

>>> +                     *data_ptr = data_head;

>>>

>>> -     return 0;

>>> +             if (seg_len_out)

>>> +                     *seg_len_out = seg_len;

>>> +     }

>>> +

>>> +     return ret;

>>>  }

>>>

>>>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>>>       void *old_tail;

>>>

>>>       if (len > pkt_hdr->tailroom)

>>>               return NULL;

>>>

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>> +

>>>       old_tail = packet_tail(pkt_hdr);

>>>       push_tail(pkt_hdr, len);

>>>

>>> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>>>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

>>>                          void **data_ptr, uint32_t *seg_len_out)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

>>>       uint32_t frame_len = pkt_hdr->frame_len;

>>>       uint32_t tailroom  = pkt_hdr->tailroom;

>>>       uint32_t tail_off  = frame_len;

>>>       int ret = 0;

>>>

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>> +

>>>       if (len > tailroom) {

>>>               pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);

>>>               int num;

>>> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt, uint32_t len)

>>>       if (len > packet_last_seg_len(pkt_hdr))

>>>               return NULL;

>>>

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>>       pull_tail(pkt_hdr, len);

>>>

>>>       return packet_tail(pkt_hdr);

>>> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt, uint32_t len,

>>>  {

>>>       int last;

>>>       uint32_t seg_len;

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>>> +     uint32_t offset;

>>> +     odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

>>> +     odp_packet_hdr_t *pkt_hdr, *prev_hdr;

>>>

>>> -     if (len > pkt_hdr->frame_len)

>>> +     if (len > packet_len(first_hdr))

>>>               return -1;

>>>

>>> +     pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

>>> +

>>> +     /* Special processing for references */

>>> +     while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr) {

>>> +             len -= (pkt_hdr->frame_len - offset);

>>> +             prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr, &offset);

>>> +             ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

>>> +             prev_hdr->ref_hdr = NULL;

>>> +             packet_free(pkt_hdr);

>>> +             pkt_hdr = prev_hdr;

>>> +     }

>>> +

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>>>       last    = packet_last_seg(pkt_hdr);

>>>       seg_len = packet_seg_len(pkt_hdr, last);

>>>

>>> -     if (len < seg_len) {

>>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>>> +         len < seg_len ||

>>> +         pkt_hdr->buf_hdr.segcount == 1) {

>>>               pull_tail(pkt_hdr, len);

>>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>>> +     } else {

>>>               int num = 0;

>>>               uint32_t pull_len = 0;

>>>

>>> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt, odp_time_t timestamp)

>>>

>>>  int odp_packet_is_segmented(odp_packet_t pkt)

>>>  {

>>> -     return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +

>>> +     return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;

>>>  }

>>>

>>>  int odp_packet_num_segs(odp_packet_t pkt)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     uint32_t segcount = 0, i;

>>> +     uint32_t seg_offset = 0, offset;

>>> +

>>> +     do {

>>> +             segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

>>> +             offset    = pkt_hdr->ref_offset;

>>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>>> +             if (pkt_hdr) {

>>> +                     for (i = 0, seg_offset = 0;

>>> +                          i < pkt_hdr->buf_hdr.segcount;

>>> +                          i++, seg_offset++) {

>>> +                             if (offset < pkt_hdr->buf_hdr.seg[i].len)

>>> +                                     break;

>>> +                             offset -= pkt_hdr->buf_hdr.seg[i].len;

>>> +                     }

>>> +             }

>>> +     } while (pkt_hdr);

>>>

>>> -     return pkt_hdr->buf_hdr.segcount;

>>> +     return segcount;

>>>  }

>>>

>>> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

>>> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

>>>  {

>>> -     (void)pkt;

>>> -

>>>       return 0;

>>>  }

>>>

>>>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

>>>  {

>>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> -

>>> -     return packet_last_seg(pkt_hdr);

>>> +     return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

>>>  }

>>>

>>>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>>

>>> -     if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))

>>> +     if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

>>>               return ODP_PACKET_SEG_INVALID;

>>>

>>>       return seg + 1;

>>> @@ -1400,21 +1581,51 @@ odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt,

>>> odp_packet_seg_t seg)

>>>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     uint32_t seg_offset = 0, offset = 0, i;

>>> +

>>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>>> +            pkt_hdr->ref_hdr) {

>>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>>> +             offset  = pkt_hdr->ref_offset;

>>> +             pkt_hdr = pkt_hdr->ref_hdr;

>>> +             for (i = 0, seg_offset = 0;

>>> +                  i < pkt_hdr->buf_hdr.segcount;

>>> +                  i++, seg_offset++) {

>>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>>> +                             break;

>>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>>> +             }

>>> +     }

>>>

>>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>>>               return NULL;

>>>

>>> -     return packet_seg_data(pkt_hdr, seg);

>>> +     return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

>>>  }

>>>

>>>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t seg)

>>>  {

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +     uint32_t seg_offset = 0, offset = 0, i;

>>> +

>>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>>> +            pkt_hdr->ref_hdr) {

>>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>>> +             offset  = pkt_hdr->ref_offset;

>>> +             pkt_hdr = pkt_hdr->ref_hdr;

>>> +             for (i = 0, seg_offset = 0;

>>> +                  i < pkt_hdr->buf_hdr.segcount;

>>> +                  i++, seg_offset++) {

>>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>>> +                             break;

>>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>>> +             }

>>> +     }

>>>

>>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>>>               return 0;

>>>

>>> -     return packet_seg_len(pkt_hdr, seg);

>>> +     return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

>>>  }

>>>

>>>  /*

>>> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t *pkt_ptr, uint32_t offset,

>>> uint32_t len)

>>>  {

>>>       odp_packet_t pkt = *pkt_ptr;

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> -     uint32_t pktlen = pkt_hdr->frame_len;

>>> +     uint32_t pktlen = packet_len(pkt_hdr);

>>>       odp_packet_t newpkt;

>>>

>>>       if (offset > pktlen)

>>>               return -1;

>>>

>>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

>>> +

>>>       newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen + len);

>>>

>>>       if (newpkt == ODP_PACKET_INVALID)

>>> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt, uint32_t offset, uint32_t

>>> len,

>>>       if (align > ODP_CACHE_LINE_SIZE)

>>>               return -1;

>>>

>>> +     ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

>>> +

>>>       if (seglen >= len) {

>>>               misalign = align <= 1 ? 0 :

>>>                       ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

>>> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

>>>       uint32_t dst_len    = dst_hdr->frame_len;

>>>       uint32_t src_len    = src_hdr->frame_len;

>>>

>>> +     ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

>>> +

>>>       /* Do a copy if resulting packet would be out of segments or packets

>>> -      * are from different pools. */

>>> +      * are from different pools or src is a reference. */

>>>       if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS) ||

>>> -         odp_unlikely(dst_pool != src_pool)) {

>>> +         odp_unlikely(dst_pool != src_pool) ||

>>> +         odp_unlikely(packet_ref_count(src_hdr)) > 1) {

>>>               if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >= 0) {

>>>                       (void)odp_packet_copy_from_pkt(*dst, dst_len,

>>>                                                      src, 0, src_len);

>>> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

>>>

>>>       add_all_segs(dst_hdr, src_hdr);

>>>

>>> -     dst_hdr->frame_len = dst_len + src_len;

>>> -     dst_hdr->tailroom  = src_hdr->tailroom;

>>> +     dst_hdr->frame_len    = dst_len + src_len;

>>> +     dst_hdr->unshared_len = dst_len + src_len;

>>> +     dst_hdr->tailroom     = src_hdr->tailroom;

>>>

>>>       /* Data was not moved in memory */

>>>       return 0;

>>> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>>> *tail)

>>>       if (len >= pktlen || tail == NULL)

>>>               return -1;

>>>

>>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

>>>       *tail = odp_packet_copy_part(*pkt, len, pktlen - len,

>>>                                    odp_packet_pool(*pkt));

>>>

>>> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>>> *tail)

>>>  }

>>>

>>>  /*

>>> + * References

>>> + */

>>> +

>>> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

>>> +{

>>> +     uint32_t i;

>>> +     odp_packet_hdr_t *hdr;

>>> +

>>> +     do {

>>> +             for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

>>> +                     hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>>> +                     packet_ref_inc(hdr);

>>> +             }

>>> +

>>> +             pkt_hdr = pkt_hdr->ref_hdr;

>>> +     } while (pkt_hdr);

>>> +}

>>> +

>>> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

>>> +                                      uint32_t offset,

>>> +                                      odp_packet_hdr_t *ref_hdr)

>>> +{

>>> +     /* Catch attempted references to stale handles in debug builds */

>>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

>>> +

>>> +     /* Splicing is from the last section of src pkt */

>>> +     while (ref_hdr->ref_hdr)

>>> +             ref_hdr = ref_hdr->ref_hdr;

>>> +

>>> +     /* Find section where splice begins */

>>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>>> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

>>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>>> +     }

>>> +

>>> +     ref_hdr->ref_hdr    = pkt_hdr;

>>> +     ref_hdr->ref_offset = offset;

>>> +     ref_hdr->ref_len    = pkt_hdr->frame_len;

>>> +

>>> +     if (offset < pkt_hdr->unshared_len)

>>> +             pkt_hdr->unshared_len = offset;

>>> +

>>> +     packet_ref(pkt_hdr);

>>> +     return _odp_packet_hdl(ref_hdr);

>>> +}

>>> +

>>> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

>>> +{

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +

>>> +     pkt_hdr->unshared_len = 0;

>>> +     packet_ref(pkt_hdr);

>>> +     return pkt;

>>> +}

>>> +

>>> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

>>> +{

>>> +     odp_packet_t hdr;

>>> +     odp_packet_hdr_t *pkt_hdr;

>>> +

>>> +     if (pkt == ODP_PACKET_INVALID)

>>> +             return ODP_PACKET_INVALID;

>>> +

>>> +     pkt_hdr = odp_packet_hdr(pkt);

>>> +     if (offset >= packet_len(pkt_hdr))

>>> +             return ODP_PACKET_INVALID;

>>> +

>>> +     hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

>>> +

>>> +     if (hdr == ODP_PACKET_INVALID)

>>> +             return ODP_PACKET_INVALID;

>>> +

>>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>>> +}

>>> +

>>> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

>>> +                             odp_packet_t hdr)

>>> +{

>>> +     odp_packet_hdr_t *pkt_hdr;

>>> +

>>> +     if (pkt == ODP_PACKET_INVALID ||

>>> +         hdr == ODP_PACKET_INVALID ||

>>> +         pkt == hdr)

>>> +             return ODP_PACKET_INVALID;

>>> +

>>> +     ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

>>> +

>>> +     pkt_hdr = odp_packet_hdr(pkt);

>>> +     if (offset >= packet_len(pkt_hdr))

>>> +             return ODP_PACKET_INVALID;

>>> +

>>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>>> +}

>>> +

>>> +int odp_packet_has_ref(odp_packet_t pkt)

>>> +{

>>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>> +

>>> +     return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;

>>> +}

>>> +

>>> +/*

>>>   *

>>>   * Copy

>>>   * ********************************************************

>>> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t

>>> *tail)

>>>

>>>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

>>>  {

>>> -     odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

>>> -     uint32_t pktlen = srchdr->frame_len;

>>> +     uint32_t pktlen = odp_packet_len(pkt);

>>>       odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

>>>

>>>       if (newpkt != ODP_PACKET_INVALID) {

>>> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt, uint32_t offset,

>>>       uint8_t *dstaddr = (uint8_t *)dst;

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>>

>>> -     if (offset + len > pkt_hdr->frame_len)

>>> +     if (offset + len > packet_len(pkt_hdr))

>>>               return -1;

>>>

>>>       while (len > 0) {

>>> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t pkt, uint32_t offset,

>>>       const uint8_t *srcaddr = (const uint8_t *)src;

>>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>>>

>>> -     if (offset + len > pkt_hdr->frame_len)

>>> +     if (offset + len > packet_len(pkt_hdr))

>>>               return -1;

>>>

>>> +     ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

>>> +

>>>       while (len > 0) {

>>>               mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

>>>               cpylen = len > seglen ? seglen : len;

>>> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t dst, uint32_t

>>> dst_offset,

>>>       uint32_t src_seglen = 0; /* GCC */

>>>       int overlap;

>>>

>>> -     if (dst_offset + len > dst_hdr->frame_len ||

>>> -         src_offset + len > src_hdr->frame_len)

>>> +     if (dst_offset + len > packet_len(dst_hdr) ||

>>> +         src_offset + len > packet_len(src_hdr))

>>>               return -1;

>>>

>>> +     ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

>>> +

>>>       overlap = (dst_hdr == src_hdr &&

>>>                  ((dst_offset <= src_offset &&

>>>                    dst_offset + len >= src_offset) ||

>>> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

>>>       len += snprintf(&str[len], n - len,

>>>                       "  l4_offset    %" PRIu32 "\n", hdr->p.l4_offset);

>>>       len += snprintf(&str[len], n - len,

>>> -                     "  frame_len    %" PRIu32 "\n", hdr->frame_len);

>>> +                     "  frame_len    %" PRIu32 "\n", packet_len(hdr));

>>>       len += snprintf(&str[len], n - len,

>>>                       "  input        %" PRIu64 "\n",

>>>                       odp_pktio_to_u64(hdr->input));

>>> --

>>> 2.9.3

>>
François Ozog Feb. 18, 2017, 3:57 p.m. UTC | #4
Well, the problem is still there.
You are operating on a packet that may no longer exist.

On 17 February 2017 at 22:08, Bill Fischofer <bill.fischofer@linaro.org>
wrote:

> I've posted patch http://patches.opendataplane.org/patch/8155/ to

> address this issue.  It goes on api-next on top of patches

> http://patches.opendataplane.org/patch/7879/ and

> http://patches.opendataplane.org/patch/8154/

>

> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

> <bill.fischofer@linaro.org> wrote:

> > First off, thank you very much for this review.

> >

> > Please note that this code has been streamlined in patch

> > http://patches.opendataplane.org/patch/7879/ and has been further

> > refined with patch http://patches.opendataplane.org/patch/8145/ but

> > the exposure you identify still exists in that code.

> >

> > On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

> > <janne.peltonen@nokia.com> wrote:

> >> Hi,

> >>

> >> I took a look at the packet references and it seems to me that

> >> either the implementation is a bit racy or I confused myself

> >> when reading the code. Or maybe I got the intended concurrency

> >> semantics of the packet references wrong?

> >>

> >> My first issue is that packet_free() may access freed packet

> >> header or corrupt unshared_len.

> >>

> >> The packet free function looks like this:

> >>

> >> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> >> {

> >>         odp_packet_hdr_t *ref_hdr;

> >>         uint32_t ref_count;

> >>

> >>         do {

> >>                 ref_hdr = pkt_hdr->ref_hdr;

> >>                 ref_count = packet_ref_count(pkt_hdr) - 1;

> >>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >>

> >>                 if (ref_count == 1)

> >>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >>

> >>                 pkt_hdr = ref_hdr;

> >>         } while (pkt_hdr);

> >> }

> >>

> >> The problem here is that decrementing the ref_count, checking

> >> its value and updating unshared_len is not single atomic

> >> operation. By the time packet_free() checks if ref_count == 1

> >> (i.e. if there is exactly one other reference left somewhere),

> >> the true ref_count may have already been changed by another

> >> thread doing packet_free() or packet_ref().

> >>

> >> For example, if two threads have a reference to the same packet

> >> then execution (or the relevant memory ops) may get "interleaved"

> >> as follows:

> >>

> >> T1: call packet_free()

> >> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

> >> At this point ref_count variable is 1

> >> T1: call free_bufs()

> >> T1: call packet_ref_dec()

> >> Now the ref_count of the packet header is 1.

> >> T2: call and complete packet_free()

> >> Thread 2 sees refcount 1 in the packet and frees the buffers

> >> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >> Thread 1 accesses freed buffer for reading and writing.

> >

> > I agree. These steps should be reversed so that the code should read:

> >

> > if (ref_count == 1)

> >    pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >

> > free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >

> > Or, with the above two patches applied, the code should read:

> >

> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> > {

> >         odp_packet_hdr_t *ref_hdr;

> >         uint32_t ref_count;

> >         int num_seg;

> >

> >         do {

> >                 ref_count = packet_ref_count(pkt_hdr);

> >                 num_seg = pkt_hdr->buf_hdr.segcount;

> >                 ref_hdr = pkt_hdr->ref_hdr;

> >

> >                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 ||

> >                                 num_seg == 1) &&

> >                                ref_count == 1)) {

> >                         buffer_free_multi((odp_buffer_t *)

> >                                 &pkt_hdr->buf_hdr.handle.handle, 1);

> >                 } else {

> >                         if (ref_count == 2)

> >                                 pkt_hdr->unshared_len =

> >                                         pkt_hdr->frame_len;

> >

> >                         free_bufs(pkt_hdr, 0, num_seg);

> >                  }

> >

> >                  pkt_hdr = ref_hdr;

> >         } while (pkt_hdr);

> > }

> >

> > The mistake was trying to optimize things so that unshared_len is not

> > set if the buffers are being freed, but that exposes these race

> > conditions. So the worst that should now happen is that it is set

> > unnecessarily before being freed.

> >

> > If you concur I'll fold this fix into a v3 for patch

> > http://patches.opendataplane.org/patch/8145/

> >

> >>

> >> Similarly, if T2 created a new reference, T1 would have

> >> a wrong idea of the number of remaining references and

> >> would adjust the unshared_len to an incorrect value.

> >>

> >> Right?

> >>

> >> Maybe other modifications of unshared_len are also racy.

> >

> > I don't believe so, because references do not change the existing ODP

> > restriction that two threads cannot share the same odp_packet_t.  When

> > a packet reference is created it returns a separate odp_packet_t that

> > has its own metadata. So unshared_len is always private to an

> > individual odp_packet_t. The exception is static references, but in

> > this case the entire packet along with its metadata must be

> > treated as read-only, so

> > operations like odp_packet_push_head() that would try to modify

> > unshared_len are prohibited.

> >

> >>

> >>

> >>

> >> The second issue is that the atomic ops for setting and

> >> reading the ref count seem to have too relaxed memory

> >> ordering. In particular, packet_ref_dec() must not happen

> >> (be visible to other threads) before its caller is done

> >> with the packet and the related memory accesses have

> >> completed. Now there does not seem to be any optimization

> >> or memory barrier to prevent the ref count decrement from

> >> happening too early. So I think it is at least theoretically

> >> possible that a thread e.g. reads from a packet buffer

> >> after it has already been freed by another thread, somehow

> >> like this:

> >>

> >> Source code order:

> >> T1: interesting_data = read_from_pkt(pkt)

> >> T1: packet_free(pkt)

> >>

> >> Order visible to T2:

> >> 1: ref count decr

> >> 2: read from pkt

> >>

> >> Now if T2 goes and frees the remaining reference between

> >> steps 1 and 2, T1 may get even more interesting data.

> >>

> >> Right?

> >

> > I don't believe so. The semantics of odp_atomic_fetch_dec_u32(), which

> > is what packet_ref_dec() uses, says that no two calls can see the same

> > fetched value, so only one thread will return ref_count == 1 and free

> > the buffer. Note that if I see ref_count == 1 no other thread can be

> > trying to increment it via a concurrent odp_packet_ref() call because

> > that would mean that two threads were trying to manipulate the same

> > odp_packet_t, which is prohibited.

> >

> > For CPUs that support out of order instruction execution, this is only

> > permitted providing the reordering and speculative executions are

> > semantically consistent with sequential execution. If this were not

> > the case you'd constantly have to worry about a processor turning

> >

> > T1: interesting_data = read_from_pkt(pkt)

> > T1: packet_free(pkt)

> >

> > into

> >

> > T1: packet_free(pkt)

> > T1: interesting_data = read_from_pkt(pkt)

> >

> > In your scenario above: T2 cannot be issuing a read to pkt after

> > ref_count is decremented because the only way that T2 could be

> > decrementing ref_count would be if T2 issued an odp_packet_free() call

> > for it. Obviously if it tries to reference pkt after such a call that

> > is an application error.

> >

> > Thanks again for your much-appreciated help in looking at this!

> >

> >>

> >>         Janne

> >>

> >>

> >>> -----Original Message-----

> >>> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of

> Bill Fischofer

> >>> Sent: Wednesday, January 11, 2017 4:34 AM

> >>> To: lng-odp@lists.linaro.org

> >>> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

> implement reference apis

> >>>

> >>> Implement the APIs:

> >>> - odp_packet_ref_static()

> >>> - odp_packet_ref()

> >>> - odp_packet_ref_pkt()

> >>> - odp_packet_has_ref()

> >>> - odp_packet_unshared_len()

> >>>

> >>> This also involves functional upgrades to the existing packet

> manipulation

> >>> APIs to work with packet references as input arguments.

> >>>

> >>> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

> >>> ---

> >>>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

> >>>  platform/linux-generic/odp_packet.c                | 536

> +++++++++++++++++----

> >>>  2 files changed, 516 insertions(+), 107 deletions(-)

> >>>

> >>> diff --git a/platform/linux-generic/include/odp_packet_internal.h

> b/platform/linux-

> >>> generic/include/odp_packet_internal.h

> >>> index e6e9d74..607560d 100644

> >>> --- a/platform/linux-generic/include/odp_packet_internal.h

> >>> +++ b/platform/linux-generic/include/odp_packet_internal.h

> >>> @@ -19,6 +19,7 @@ extern "C" {

> >>>

> >>>  #include <odp/api/align.h>

> >>>  #include <odp/api/debug.h>

> >>> +#include <odp_debug_internal.h>

> >>>  #include <odp_buffer_internal.h>

> >>>  #include <odp_pool_internal.h>

> >>>  #include <odp_buffer_inlines.h>

> >>> @@ -168,7 +169,7 @@ typedef struct {

> >>>   * packet_init(). Because of this any new fields added must be

> reviewed for

> >>>   * initialization requirements.

> >>>   */

> >>> -typedef struct {

> >>> +typedef struct odp_packet_hdr_t {

> >>>       /* common buffer header */

> >>>       odp_buffer_hdr_t buf_hdr;

> >>>

> >>> @@ -184,6 +185,13 @@ typedef struct {

> >>>       uint32_t headroom;

> >>>       uint32_t tailroom;

> >>>

> >>> +     /* Fields used to support packet references */

> >>> +     uint32_t unshared_len;

> >>> +     struct odp_packet_hdr_t *ref_hdr;

> >>> +     uint32_t ref_offset;

> >>> +     uint32_t ref_len;

> >>> +     odp_atomic_u32_t ref_count;

> >>> +

> >>>       /*

> >>>        * Members below are not initialized by packet_init()

> >>>        */

> >>> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t

> *odp_packet_hdr(odp_packet_t pkt)

> >>>       return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

> >>>  }

> >>>

> >>> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,

> >>> +                                                 uint32_t *offset)

> >>> +{

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

> >>> +     uint32_t ref_offset = 0;

> >>> +

> >>> +     while (pkt_hdr->ref_hdr) {

> >>> +             ref_offset = pkt_hdr->ref_offset;

> >>> +             prev_hdr   = pkt_hdr;

> >>> +             pkt_hdr    = pkt_hdr->ref_hdr;

> >>> +     }

> >>> +

> >>> +     if (offset) {

> >>> +             if (prev_hdr != pkt_hdr)

> >>> +                     ref_offset += pkt_hdr->frame_len -

> prev_hdr->ref_len;

> >>> +             *offset = ref_offset;

> >>> +     }

> >>> +

> >>> +     return pkt_hdr;

> >>> +}

> >>> +

> >>> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t

> *pkt_hdr,

> >>> +                                                 odp_packet_hdr_t

> *cur_hdr,

> >>> +                                                 uint32_t *offset)

> >>> +{

> >>> +     uint32_t ref_offset = 0;

> >>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

> >>> +

> >>> +     while (pkt_hdr->ref_hdr != cur_hdr) {

> >>> +             ref_offset = pkt_hdr->ref_offset;

> >>> +             prev_hdr   = pkt_hdr;

> >>> +             pkt_hdr    = pkt_hdr->ref_hdr;

> >>> +     }

> >>> +

> >>> +     if (offset) {

> >>> +             if (prev_hdr != pkt_hdr)

> >>> +                     ref_offset += pkt_hdr->frame_len -

> prev_hdr->ref_len;

> >>> +             *offset = ref_offset;

> >>> +     }

> >>> +

> >>> +     return pkt_hdr;

> >>> +}

> >>> +

> >>> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

> >>> +}

> >>> +

> >>>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t

> *src_hdr,

> >>>                                              odp_packet_hdr_t *dst_hdr)

> >>>  {

> >>> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t

> *pkt_hdr, uint32_t

> >>> len)

> >>>

> >>>       pkt_hdr->tailroom  += len;

> >>>       pkt_hdr->frame_len -= len;

> >>> +     pkt_hdr->unshared_len -= len;

> >>>       pkt_hdr->buf_hdr.seg[last].len -= len;

> >>>  }

> >>>

> >>>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

> >>>  {

> >>> -     return pkt_hdr->frame_len;

> >>> +     uint32_t pkt_len = 0;

> >>> +     uint32_t offset  = 0;

> >>> +

> >>> +     do {

> >>> +             pkt_len += pkt_hdr->frame_len - offset;

> >>> +             offset   = pkt_hdr->ref_offset;

> >>> +             if (pkt_hdr->ref_hdr)

> >>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

> >>> +                                pkt_hdr->ref_len);

> >>> +             pkt_hdr  = pkt_hdr->ref_hdr;

> >>> +     } while (pkt_hdr);

> >>> +

> >>> +     return pkt_len;

> >>> +}

> >>> +

> >>> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     return odp_atomic_load_u32(&pkt_hdr->ref_count);

> >>> +}

> >>> +

> >>> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr,

> uint32_t n)

> >>> +{

> >>> +     odp_atomic_init_u32(&pkt_hdr->ref_count, n);

> >>>  }

> >>>

> >>>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr,

> uint32_t len)

> >>>  {

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>> +

> >>>       pkt_hdr->frame_len = len;

> >>> +     pkt_hdr->unshared_len = len;

> >>>  }

> >>>

> >>>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

> >>> diff --git a/platform/linux-generic/odp_packet.c

> b/platform/linux-generic/odp_packet.c

> >>> index f632a51..170965a 100644

> >>> --- a/platform/linux-generic/odp_packet.c

> >>> +++ b/platform/linux-generic/odp_packet.c

> >>> @@ -33,13 +33,24 @@ static inline odp_buffer_t

> buffer_handle(odp_packet_hdr_t *pkt_hdr)

> >>>       return pkt_hdr->buf_hdr.handle.handle;

> >>>  }

> >>>

> >>> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

> >>> +}

> >>> +

> >>> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

> >>> +}

> >>> +

> >>>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

> >>>                                     uint32_t seg_idx)

> >>>  {

> >>>       return pkt_hdr->buf_hdr.seg[seg_idx].len;

> >>>  }

> >>>

> >>> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

> uint32_t seg_idx)

> >>> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

> >>> +                                    uint32_t seg_idx)

> >>>  {

> >>>       return pkt_hdr->buf_hdr.seg[seg_idx].data;

> >>>  }

> >>> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t

> *pkt_hdr)

> >>>               return pkt_hdr->buf_hdr.segcount - 1;

> >>>  }

> >>>

> >>> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     return pkt_hdr->buf_hdr.seg[0].data;

> >>> +}

> >>> +

> >>>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t

> *pkt_hdr)

> >>>  {

> >>>       return packet_seg_len(pkt_hdr, 0);

> >>> @@ -64,11 +80,6 @@ static inline uint32_t packet_last_seg_len(odp_packet_hdr_t

> *pkt_hdr)

> >>>       return packet_seg_len(pkt_hdr, last);

> >>>  }

> >>>

> >>> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> >>> -{

> >>> -     return pkt_hdr->buf_hdr.seg[0].data;

> >>> -}

> >>> -

> >>>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

> >>>  {

> >>>       int last = packet_last_seg(pkt_hdr);

> >>> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t

> *pkt_hdr, uint32_t len)

> >>>  {

> >>>       pkt_hdr->headroom  -= len;

> >>>       pkt_hdr->frame_len += len;

> >>> +     pkt_hdr->unshared_len += len;

> >>>       pkt_hdr->buf_hdr.seg[0].data -= len;

> >>>       pkt_hdr->buf_hdr.seg[0].len  += len;

> >>>  }

> >>> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t

> *pkt_hdr, uint32_t len)

> >>>  {

> >>>       pkt_hdr->headroom  += len;

> >>>       pkt_hdr->frame_len -= len;

> >>> +     pkt_hdr->unshared_len -= len;

> >>>       pkt_hdr->buf_hdr.seg[0].data += len;

> >>>       pkt_hdr->buf_hdr.seg[0].len  -= len;

> >>>  }

> >>> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t

> *pkt_hdr, uint32_t len)

> >>>

> >>>       pkt_hdr->tailroom  -= len;

> >>>       pkt_hdr->frame_len += len;

> >>> +     pkt_hdr->unshared_len += len;

> >>>       pkt_hdr->buf_hdr.seg[last].len += len;

> >>>  }

> >>>

> >>> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t

> *dst,

> >>>       dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

> >>>       dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

> >>>

> >>> +     /* reference related metadata */

> >>> +     dst->ref_len      = src->ref_len;

> >>> +     dst->unshared_len = src->unshared_len;

> >>> +

> >>>       /* segmentation data is not copied:

> >>>        *   buf_hdr.seg[]

> >>>        *   buf_hdr.segcount

> >>> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t

> *pkt_hdr,

> >>>       int seg = 0;

> >>>       int seg_count = pkt_hdr->buf_hdr.segcount;

> >>>

> >>> -     if (odp_unlikely(offset >= pkt_hdr->frame_len))

> >>> +     /* Special processing for references */

> >>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> >>> +             offset   += (pkt_hdr->ref_hdr->frame_len -

> pkt_hdr->ref_len);

> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >>> +             seg_count = pkt_hdr->buf_hdr.segcount;

> >>> +     }

> >>> +

> >>> +     if (odp_unlikely(offset > pkt_hdr->frame_len))

> >>>               return NULL;

> >>>

> >>>       if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {

> >>> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)

> >>>       pkt_hdr->p.l2_offset        = 0;

> >>>       pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

> >>>       pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

> >>> +

> >>> +     /* Ensure dummy pkt_hdrs used in I/O recv classification are

> valid */

> >>> +     pkt_hdr->ref_hdr = NULL;

> >>>  }

> >>>

> >>>  /**

> >>> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t

> *pkt_hdr, uint32_t

> >>> len,

> >>>                            CONFIG_PACKET_TAILROOM;

> >>>

> >>>       pkt_hdr->input = ODP_PKTIO_INVALID;

> >>> +

> >>> +     /* By default packet has no references */

> >>> +     pkt_hdr->unshared_len = len;

> >>> +     pkt_hdr->ref_hdr = NULL;

> >>>  }

> >>>

> >>>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> >>> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t

> *pkt_hdr[], int num)

> >>>

> >>>       hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

> >>>       hdr->buf_hdr.seg[0].len  = BASE_LEN;

> >>> +     packet_ref_count_set(hdr, 1);

> >>>

> >>>       /* Link segments */

> >>>       if (CONFIG_PACKET_MAX_SEGS != 1) {

> >>> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t

> *pkt_hdr[], int num)

> >>>                       for (i = 1; i < num; i++) {

> >>>                               odp_buffer_hdr_t *buf_hdr;

> >>>

> >>> +                             packet_ref_count_set(pkt_hdr[i], 1);

> >>>                               buf_hdr = &pkt_hdr[i]->buf_hdr;

> >>>                               hdr->buf_hdr.seg[i].hdr  = buf_hdr;

> >>>                               hdr->buf_hdr.seg[i].data =

> buf_hdr->base_data;

> >>> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t

> *add_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>               new_hdr->buf_hdr.seg[0].len   = seg_len;

> >>>

> >>>               packet_seg_copy_md(new_hdr, pkt_hdr);

> >>> -             new_hdr->frame_len = pkt_hdr->frame_len + len;

> >>> -             new_hdr->headroom  = pool->headroom + offset;

> >>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

> >>> +             new_hdr->frame_len    = pkt_hdr->frame_len + len;

> >>> +             new_hdr->unshared_len = pkt_hdr->unshared_len + len;

> >>> +             new_hdr->headroom     = pool->headroom + offset;

> >>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

> >>>

> >>>               pkt_hdr = new_hdr;

> >>>       } else {

> >>> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t

> *add_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>               last = packet_last_seg(pkt_hdr);

> >>>               pkt_hdr->buf_hdr.seg[last].len = seg_len;

> >>>

> >>> -             pkt_hdr->frame_len += len;

> >>> -             pkt_hdr->tailroom   = pool->tailroom + offset;

> >>> +             pkt_hdr->frame_len    += len;

> >>> +             pkt_hdr->unshared_len += len;

> >>> +             pkt_hdr->tailroom      = pool->tailroom + offset;

> >>>       }

> >>>

> >>>       return pkt_hdr;

> >>> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t

> *add_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>

> >>>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first,

> int num)

> >>>  {

> >>> -     int i;

> >>> +     int i, nfree;

> >>>       odp_buffer_t buf[num];

> >>>

> >>> -     for (i = 0; i < num; i++)

> >>> -             buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first +

> i].hdr);

> >>> +     for (i = 0, nfree = 0; i < num; i++) {

> >>> +             odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first +

> i].hdr;

> >>> +

> >>> +             if (packet_ref_dec(hdr) == 1)

> >>> +                     buf[nfree++] = buffer_handle(hdr);

> >>> +     }

> >>>

> >>> -     buffer_free_multi(buf, num);

> >>> +     if (nfree > 0)

> >>> +             buffer_free_multi(buf, nfree);

> >>>  }

> >>>

> >>>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

> *pkt_hdr,

> >>> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t

> *free_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>

> >>>       if (head) {

> >>>               odp_packet_hdr_t *new_hdr;

> >>> -             int i;

> >>> +             int i, nfree;

> >>>               odp_buffer_t buf[num];

> >>>

> >>> -             for (i = 0; i < num; i++)

> >>> -                     buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);

> >>> +             for (i = 0, nfree = 0; i < num; i++) {

> >>> +                     new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> >>> +

> >>> +                     if (packet_ref_dec(new_hdr) == 1)

> >>> +                             buf[nfree++] = buffer_handle(new_hdr);

> >>> +             }

> >>>

> >>>               /* First remaining segment is the new packet descriptor

> */

> >>>               new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

> >>> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t

> *free_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>               packet_seg_copy_md(new_hdr, pkt_hdr);

> >>>

> >>>               /* Tailroom not changed */

> >>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

> >>> -             new_hdr->headroom  = seg_headroom(new_hdr, 0);

> >>> -             new_hdr->frame_len = pkt_hdr->frame_len - free_len;

> >>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

> >>> +             new_hdr->headroom     = seg_headroom(new_hdr, 0);

> >>> +             new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

> >>> +             new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;

> >>>

> >>>               pull_head(new_hdr, pull_len);

> >>>

> >>>               pkt_hdr = new_hdr;

> >>>

> >>> -             buffer_free_multi(buf, num);

> >>> +             if (nfree > 0)

> >>> +                     buffer_free_multi(buf, nfree);

> >>>       } else {

> >>>               /* Free last 'num' bufs */

> >>>               free_bufs(pkt_hdr, num_remain, num);

> >>> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t

> *free_segments(odp_packet_hdr_t

> >>> *pkt_hdr,

> >>>                * of the metadata. */

> >>>               pkt_hdr->buf_hdr.segcount = num_remain;

> >>>               pkt_hdr->frame_len -= free_len;

> >>> +             pkt_hdr->unshared_len -= free_len;

> >>>               pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain -

> 1);

> >>>

> >>>               pull_tail(pkt_hdr, pull_len);

> >>> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t pool_hdl,

> uint32_t len,

> >>>       return num;

> >>>  }

> >>>

> >>> -void odp_packet_free(odp_packet_t pkt)

> >>> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> -     int num_seg = pkt_hdr->buf_hdr.segcount;

> >>> +     odp_packet_hdr_t *ref_hdr;

> >>> +     uint32_t ref_count;

> >>>

> >>> -     if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

> >>> -             buffer_free_multi((odp_buffer_t *)&pkt, 1);

> >>> -     else

> >>> -             free_bufs(pkt_hdr, 0, num_seg);

> >>> -}

> >>> +     do {

> >>> +             ref_hdr = pkt_hdr->ref_hdr;

> >>> +             ref_count = packet_ref_count(pkt_hdr) - 1;

> >>> +             free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >>>

> >>> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> >>> -{

> >>> -     if (CONFIG_PACKET_MAX_SEGS == 1) {

> >>> -             buffer_free_multi((const odp_buffer_t * const)pkt, num);

> >>> -     } else {

> >>> -             odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

> >>> -             int i, j;

> >>> -             int bufs = 0;

> >>> +             if (ref_count == 1)

> >>> +                     pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >>>

> >>> -             for (i = 0; i < num; i++) {

> >>> -                     odp_packet_hdr_t *pkt_hdr =

> odp_packet_hdr(pkt[i]);

> >>> -                     int num_seg = pkt_hdr->buf_hdr.segcount;

> >>> -                     odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

> >>> -

> >>> -                     buf[bufs] = (odp_buffer_t)pkt[i];

> >>> -                     bufs++;

> >>> +             pkt_hdr = ref_hdr;

> >>> +     } while (pkt_hdr);

> >>> +}

> >>>

> >>> -                     if (odp_likely(num_seg == 1))

> >>> -                             continue;

> >>> +void odp_packet_free(odp_packet_t pkt)

> >>> +{

> >>> +     packet_free(odp_packet_hdr(pkt));

> >>> +}

> >>>

> >>> -                     for (j = 1; j < num_seg; j++) {

> >>> -                             buf[bufs] =

> buffer_handle(buf_hdr->seg[j].hdr);

> >>> -                             bufs++;

> >>> -                     }

> >>> -             }

> >>> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> >>> +{

> >>> +     int i;

> >>>

> >>> -             buffer_free_multi(buf, bufs);

> >>> -     }

> >>> +     for (i = 0; i < num; i++)

> >>> +             packet_free(odp_packet_hdr(pkt[i]));

> >>>  }

> >>>

> >>>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

> >>> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t

> len)

> >>>       if (len > pool->headroom + pool->data_size + pool->tailroom)

> >>>               return -1;

> >>>

> >>> +     if (pkt_hdr->ref_hdr)

> >>> +             packet_free(pkt_hdr->ref_hdr);

> >>> +

> >>>       packet_init(pkt_hdr, len, 0);

> >>>

> >>>       return 0;

> >>> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

> >>>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     uint32_t buf_len = 0;

> >>>

> >>> -     return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

> >>> +     do {

> >>> +             buf_len += pkt_hdr->buf_hdr.size *

> pkt_hdr->buf_hdr.segcount;

> >>> +             pkt_hdr  = pkt_hdr->ref_hdr;

> >>> +     } while (pkt_hdr);

> >>> +

> >>> +     return buf_len;

> >>>  }

> >>>

> >>>  void *odp_packet_data(odp_packet_t pkt)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>>

> >>> -     return packet_data(pkt_hdr);

> >>> +     return packet_map(pkt_hdr, 0, NULL, NULL);

> >>>  }

> >>>

> >>>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

> >>> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

> >>>

> >>>  uint32_t odp_packet_len(odp_packet_t pkt)

> >>>  {

> >>> -     return odp_packet_hdr(pkt)->frame_len;

> >>> +     return packet_len(odp_packet_hdr(pkt));

> >>> +}

> >>> +

> >>> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

> >>> +{

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     uint32_t pkt_len = 0, offset = 0;

> >>> +

> >>> +     do {

> >>> +             if (packet_ref_count(pkt_hdr) > 1) {

> >>> +                     if (offset == 0)

> >>> +                             pkt_len += pkt_hdr->unshared_len;

> >>> +                     break;

> >>> +             }

> >>> +

> >>> +             pkt_len += pkt_hdr->frame_len - offset;

> >>> +             offset   = pkt_hdr->ref_offset;

> >>> +

> >>> +             if (pkt_hdr->ref_hdr)

> >>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

> >>> +                                pkt_hdr->ref_len);

> >>> +

> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

> >>> +     } while (pkt_hdr);

> >>> +

> >>> +     return pkt_len;

> >>>  }

> >>>

> >>>  uint32_t odp_packet_headroom(odp_packet_t pkt)

> >>> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

> >>>

> >>>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

> >>>  {

> >>> -     return odp_packet_hdr(pkt)->tailroom;

> >>> +     return odp_packet_last_hdr(pkt, NULL)->tailroom;

> >>>  }

> >>>

> >>>  void *odp_packet_tail(odp_packet_t pkt)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

> >>>

> >>>       return packet_tail(pkt_hdr);

> >>>  }

> >>> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt,

> uint32_t len,

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >>>       uint32_t frame_len = pkt_hdr->frame_len;

> >>> -     uint32_t headroom  = pkt_hdr->headroom;

> >>> +     uint32_t headroom = pkt_hdr->headroom;

> >>>       int ret = 0;

> >>>

> >>>       if (len > headroom) {

> >>> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt,

> uint32_t len,

> >>>               segs = pkt_hdr->buf_hdr.segcount;

> >>>

> >>>               if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS))

> {

> >>> +                     /* Handle recursively via references when

> >>> +                      * working with referenced packets since another

> >>> +                      * thread may be accessing it concurrently via

> >>> +                      * its reference to it. */

> >>> +                     if (packet_ref_count(pkt_hdr) > 1) {

> >>> +                             odp_packet_t ref;

> >>> +                             uint32_t unshared_len;

> >>> +

> >>> +                             push_head(pkt_hdr, headroom);

> >>> +                             unshared_len = pkt_hdr->unshared_len;

> >>> +                             ref = odp_packet_ref(*pkt, 0);

> >>> +

> >>> +                             if (ref == ODP_PACKET_INVALID) {

> >>> +                                     pull_head(pkt_hdr, headroom);

> >>> +                                     return -1;

> >>> +                             }

> >>> +

> >>> +                             ret = odp_packet_extend_head(&ref,

> >>> +                                                          len -

> headroom,

> >>> +                                                          data_ptr,

> >>> +                                                          seg_len);

> >>> +

> >>> +                             if (ret < 0) {

> >>> +                                     odp_packet_free(ref);

> >>> +                                     pull_head(pkt_hdr, headroom);

> >>> +                                     return -1;

> >>> +                             }

> >>> +

> >>> +                             /* Since this is a special ref, the

> >>> +                              * base pkt's unshared len is unchanged

> */

> >>> +                             pkt_hdr->unshared_len = unshared_len;

> >>> +

> >>> +                             /* Remove extra ref to the base pkt */

> >>> +                             odp_packet_free(*pkt);

> >>> +

> >>> +                             /* Return the ref as the extension

> result */

> >>> +                             *pkt = ref;

> >>> +                             return 1;

> >>> +                     }

> >>> +

> >>>                       /* Cannot directly add new segments */

> >>>                       odp_packet_hdr_t *new_hdr;

> >>>                       int new_segs = 0;

> >>> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt,

> uint32_t len,

> >>>

> >>>                       pkt_hdr->buf_hdr.segcount = segs;

> >>>                       pkt_hdr->frame_len        = frame_len;

> >>> +                     pkt_hdr->unshared_len     = frame_len;

> >>>                       pkt_hdr->headroom         = offset +

> pool->headroom;

> >>>                       pkt_hdr->tailroom         = pool->tailroom;

> >>>

> >>> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt,

> uint32_t len,

> >>>               push_head(pkt_hdr, len);

> >>>       }

> >>>

> >>> -     if (data_ptr)

> >>> -             *data_ptr = packet_data(pkt_hdr);

> >>> +     if (data_ptr || seg_len) {

> >>> +             uint32_t seg_ln = 0;

> >>> +             void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

> >>>

> >>> -     if (seg_len)

> >>> -             *seg_len = packet_first_seg_len(pkt_hdr);

> >>> +             if (data_ptr)

> >>> +                     *data_ptr = data;

> >>> +

> >>> +             if (seg_len)

> >>> +                     *seg_len = seg_ln;

> >>> +     }

> >>>

> >>>       return ret;

> >>>  }

> >>> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt,

> uint32_t len)

> >>>       if (len > pkt_hdr->frame_len)

> >>>               return NULL;

> >>>

> >>> +     ODP_ASSERT(len <= pkt_hdr->unshared_len);

> >>> +

> >>>       pull_head(pkt_hdr, len);

> >>>       return packet_data(pkt_hdr);

> >>>  }

> >>> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt,

> uint32_t len)

> >>>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

> >>>                         void **data_ptr, uint32_t *seg_len_out)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

> >>>       uint32_t seg_len = packet_first_seg_len(pkt_hdr);

> >>> +     int ret = 0;

> >>>

> >>> -     if (len > pkt_hdr->frame_len)

> >>> +     if (len > packet_len(pkt_hdr))

> >>>               return -1;

> >>>

> >>> -     if (len < seg_len) {

> >>> +     ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

> >>> +

> >>> +     /* Special processing for references */

> >>> +     while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >>> +             ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>> +             nxt_hdr = pkt_hdr->ref_hdr;

> >>> +             len -= pkt_hdr->frame_len;

> >>> +             len += pkt_hdr->ref_offset +

> >>> +                     (nxt_hdr->frame_len - pkt_hdr->ref_len);

> >>> +             pkt_hdr->ref_hdr = NULL;

> >>> +             packet_free(pkt_hdr);

> >>> +             pkt_hdr = nxt_hdr;

> >>> +             seg_len = packet_first_seg_len(pkt_hdr);

> >>> +             *pkt = packet_handle(pkt_hdr);

> >>> +             ret = 1;

> >>> +     }

> >>> +

> >>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

> >>> +         len < seg_len ||

> >>> +         pkt_hdr->buf_hdr.segcount == 1) {

> >>>               pull_head(pkt_hdr, len);

> >>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

> >>> +     } else {

> >>>               int num = 0;

> >>>               uint32_t pull_len = 0;

> >>>

> >>> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt,

> uint32_t len,

> >>>               *pkt    = packet_handle(pkt_hdr);

> >>>       }

> >>>

> >>> -     if (data_ptr)

> >>> -             *data_ptr = packet_data(pkt_hdr);

> >>> +     if (data_ptr || seg_len_out) {

> >>> +             void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);

> >>>

> >>> -     if (seg_len_out)

> >>> -             *seg_len_out = packet_first_seg_len(pkt_hdr);

> >>> +             if (data_ptr)

> >>> +                     *data_ptr = data_head;

> >>>

> >>> -     return 0;

> >>> +             if (seg_len_out)

> >>> +                     *seg_len_out = seg_len;

> >>> +     }

> >>> +

> >>> +     return ret;

> >>>  }

> >>>

> >>>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

> >>>       void *old_tail;

> >>>

> >>>       if (len > pkt_hdr->tailroom)

> >>>               return NULL;

> >>>

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>> +

> >>>       old_tail = packet_tail(pkt_hdr);

> >>>       push_tail(pkt_hdr, len);

> >>>

> >>> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt,

> uint32_t len)

> >>>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

> >>>                          void **data_ptr, uint32_t *seg_len_out)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

> >>>       uint32_t frame_len = pkt_hdr->frame_len;

> >>>       uint32_t tailroom  = pkt_hdr->tailroom;

> >>>       uint32_t tail_off  = frame_len;

> >>>       int ret = 0;

> >>>

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>> +

> >>>       if (len > tailroom) {

> >>>               pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);

> >>>               int num;

> >>> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt,

> uint32_t len)

> >>>       if (len > packet_last_seg_len(pkt_hdr))

> >>>               return NULL;

> >>>

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>>       pull_tail(pkt_hdr, len);

> >>>

> >>>       return packet_tail(pkt_hdr);

> >>> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt,

> uint32_t len,

> >>>  {

> >>>       int last;

> >>>       uint32_t seg_len;

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >>> +     uint32_t offset;

> >>> +     odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

> >>> +     odp_packet_hdr_t *pkt_hdr, *prev_hdr;

> >>>

> >>> -     if (len > pkt_hdr->frame_len)

> >>> +     if (len > packet_len(first_hdr))

> >>>               return -1;

> >>>

> >>> +     pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

> >>> +

> >>> +     /* Special processing for references */

> >>> +     while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr)

> {

> >>> +             len -= (pkt_hdr->frame_len - offset);

> >>> +             prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr,

> &offset);

> >>> +             ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

> >>> +             prev_hdr->ref_hdr = NULL;

> >>> +             packet_free(pkt_hdr);

> >>> +             pkt_hdr = prev_hdr;

> >>> +     }

> >>> +

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>>       last    = packet_last_seg(pkt_hdr);

> >>>       seg_len = packet_seg_len(pkt_hdr, last);

> >>>

> >>> -     if (len < seg_len) {

> >>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

> >>> +         len < seg_len ||

> >>> +         pkt_hdr->buf_hdr.segcount == 1) {

> >>>               pull_tail(pkt_hdr, len);

> >>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

> >>> +     } else {

> >>>               int num = 0;

> >>>               uint32_t pull_len = 0;

> >>>

> >>> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt,

> odp_time_t timestamp)

> >>>

> >>>  int odp_packet_is_segmented(odp_packet_t pkt)

> >>>  {

> >>> -     return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +

> >>> +     return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;

> >>>  }

> >>>

> >>>  int odp_packet_num_segs(odp_packet_t pkt)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     uint32_t segcount = 0, i;

> >>> +     uint32_t seg_offset = 0, offset;

> >>> +

> >>> +     do {

> >>> +             segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

> >>> +             offset    = pkt_hdr->ref_offset;

> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >>> +             if (pkt_hdr) {

> >>> +                     for (i = 0, seg_offset = 0;

> >>> +                          i < pkt_hdr->buf_hdr.segcount;

> >>> +                          i++, seg_offset++) {

> >>> +                             if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >>> +                                     break;

> >>> +                             offset -= pkt_hdr->buf_hdr.seg[i].len;

> >>> +                     }

> >>> +             }

> >>> +     } while (pkt_hdr);

> >>>

> >>> -     return pkt_hdr->buf_hdr.segcount;

> >>> +     return segcount;

> >>>  }

> >>>

> >>> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

> >>> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

> >>>  {

> >>> -     (void)pkt;

> >>> -

> >>>       return 0;

> >>>  }

> >>>

> >>>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

> >>>  {

> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> -

> >>> -     return packet_last_seg(pkt_hdr);

> >>> +     return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

> >>>  }

> >>>

> >>>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt,

> odp_packet_seg_t seg)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>>

> >>> -     if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))

> >>> +     if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

> >>>               return ODP_PACKET_SEG_INVALID;

> >>>

> >>>       return seg + 1;

> >>> @@ -1400,21 +1581,51 @@ odp_packet_seg_t odp_packet_next_seg(odp_packet_t

> pkt,

> >>> odp_packet_seg_t seg)

> >>>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     uint32_t seg_offset = 0, offset = 0, i;

> >>> +

> >>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> >>> +            pkt_hdr->ref_hdr) {

> >>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> >>> +             offset  = pkt_hdr->ref_offset;

> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

> >>> +             for (i = 0, seg_offset = 0;

> >>> +                  i < pkt_hdr->buf_hdr.segcount;

> >>> +                  i++, seg_offset++) {

> >>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >>> +                             break;

> >>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

> >>> +             }

> >>> +     }

> >>>

> >>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> >>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

> >>>               return NULL;

> >>>

> >>> -     return packet_seg_data(pkt_hdr, seg);

> >>> +     return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

> >>>  }

> >>>

> >>>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t

> seg)

> >>>  {

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +     uint32_t seg_offset = 0, offset = 0, i;

> >>> +

> >>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> >>> +            pkt_hdr->ref_hdr) {

> >>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> >>> +             offset  = pkt_hdr->ref_offset;

> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

> >>> +             for (i = 0, seg_offset = 0;

> >>> +                  i < pkt_hdr->buf_hdr.segcount;

> >>> +                  i++, seg_offset++) {

> >>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >>> +                             break;

> >>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

> >>> +             }

> >>> +     }

> >>>

> >>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> >>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

> >>>               return 0;

> >>>

> >>> -     return packet_seg_len(pkt_hdr, seg);

> >>> +     return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

> >>>  }

> >>>

> >>>  /*

> >>> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t

> *pkt_ptr, uint32_t offset,

> >>> uint32_t len)

> >>>  {

> >>>       odp_packet_t pkt = *pkt_ptr;

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> -     uint32_t pktlen = pkt_hdr->frame_len;

> >>> +     uint32_t pktlen = packet_len(pkt_hdr);

> >>>       odp_packet_t newpkt;

> >>>

> >>>       if (offset > pktlen)

> >>>               return -1;

> >>>

> >>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

> >>> +

> >>>       newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen +

> len);

> >>>

> >>>       if (newpkt == ODP_PACKET_INVALID)

> >>> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt, uint32_t

> offset, uint32_t

> >>> len,

> >>>       if (align > ODP_CACHE_LINE_SIZE)

> >>>               return -1;

> >>>

> >>> +     ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

> >>> +

> >>>       if (seglen >= len) {

> >>>               misalign = align <= 1 ? 0 :

> >>>                       ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

> >>> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst,

> odp_packet_t src)

> >>>       uint32_t dst_len    = dst_hdr->frame_len;

> >>>       uint32_t src_len    = src_hdr->frame_len;

> >>>

> >>> +     ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

> >>> +

> >>>       /* Do a copy if resulting packet would be out of segments or

> packets

> >>> -      * are from different pools. */

> >>> +      * are from different pools or src is a reference. */

> >>>       if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS)

> ||

> >>> -         odp_unlikely(dst_pool != src_pool)) {

> >>> +         odp_unlikely(dst_pool != src_pool) ||

> >>> +         odp_unlikely(packet_ref_count(src_hdr)) > 1) {

> >>>               if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >=

> 0) {

> >>>                       (void)odp_packet_copy_from_pkt(*dst, dst_len,

> >>>                                                      src, 0, src_len);

> >>> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst,

> odp_packet_t src)

> >>>

> >>>       add_all_segs(dst_hdr, src_hdr);

> >>>

> >>> -     dst_hdr->frame_len = dst_len + src_len;

> >>> -     dst_hdr->tailroom  = src_hdr->tailroom;

> >>> +     dst_hdr->frame_len    = dst_len + src_len;

> >>> +     dst_hdr->unshared_len = dst_len + src_len;

> >>> +     dst_hdr->tailroom     = src_hdr->tailroom;

> >>>

> >>>       /* Data was not moved in memory */

> >>>       return 0;

> >>> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t

> len, odp_packet_t

> >>> *tail)

> >>>       if (len >= pktlen || tail == NULL)

> >>>               return -1;

> >>>

> >>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

> >>>       *tail = odp_packet_copy_part(*pkt, len, pktlen - len,

> >>>                                    odp_packet_pool(*pkt));

> >>>

> >>> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt,

> uint32_t len, odp_packet_t

> >>> *tail)

> >>>  }

> >>>

> >>>  /*

> >>> + * References

> >>> + */

> >>> +

> >>> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

> >>> +{

> >>> +     uint32_t i;

> >>> +     odp_packet_hdr_t *hdr;

> >>> +

> >>> +     do {

> >>> +             for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

> >>> +                     hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> >>> +                     packet_ref_inc(hdr);

> >>> +             }

> >>> +

> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

> >>> +     } while (pkt_hdr);

> >>> +}

> >>> +

> >>> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

> >>> +                                      uint32_t offset,

> >>> +                                      odp_packet_hdr_t *ref_hdr)

> >>> +{

> >>> +     /* Catch attempted references to stale handles in debug builds */

> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

> >>> +

> >>> +     /* Splicing is from the last section of src pkt */

> >>> +     while (ref_hdr->ref_hdr)

> >>> +             ref_hdr = ref_hdr->ref_hdr;

> >>> +

> >>> +     /* Find section where splice begins */

> >>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> >>> +             offset   += (pkt_hdr->ref_hdr->frame_len -

> pkt_hdr->ref_len);

> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >>> +     }

> >>> +

> >>> +     ref_hdr->ref_hdr    = pkt_hdr;

> >>> +     ref_hdr->ref_offset = offset;

> >>> +     ref_hdr->ref_len    = pkt_hdr->frame_len;

> >>> +

> >>> +     if (offset < pkt_hdr->unshared_len)

> >>> +             pkt_hdr->unshared_len = offset;

> >>> +

> >>> +     packet_ref(pkt_hdr);

> >>> +     return _odp_packet_hdl(ref_hdr);

> >>> +}

> >>> +

> >>> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

> >>> +{

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +

> >>> +     pkt_hdr->unshared_len = 0;

> >>> +     packet_ref(pkt_hdr);

> >>> +     return pkt;

> >>> +}

> >>> +

> >>> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

> >>> +{

> >>> +     odp_packet_t hdr;

> >>> +     odp_packet_hdr_t *pkt_hdr;

> >>> +

> >>> +     if (pkt == ODP_PACKET_INVALID)

> >>> +             return ODP_PACKET_INVALID;

> >>> +

> >>> +     pkt_hdr = odp_packet_hdr(pkt);

> >>> +     if (offset >= packet_len(pkt_hdr))

> >>> +             return ODP_PACKET_INVALID;

> >>> +

> >>> +     hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

> >>> +

> >>> +     if (hdr == ODP_PACKET_INVALID)

> >>> +             return ODP_PACKET_INVALID;

> >>> +

> >>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> >>> +}

> >>> +

> >>> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

> >>> +                             odp_packet_t hdr)

> >>> +{

> >>> +     odp_packet_hdr_t *pkt_hdr;

> >>> +

> >>> +     if (pkt == ODP_PACKET_INVALID ||

> >>> +         hdr == ODP_PACKET_INVALID ||

> >>> +         pkt == hdr)

> >>> +             return ODP_PACKET_INVALID;

> >>> +

> >>> +     ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

> >>> +

> >>> +     pkt_hdr = odp_packet_hdr(pkt);

> >>> +     if (offset >= packet_len(pkt_hdr))

> >>> +             return ODP_PACKET_INVALID;

> >>> +

> >>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> >>> +}

> >>> +

> >>> +int odp_packet_has_ref(odp_packet_t pkt)

> >>> +{

> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>> +

> >>> +     return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;

> >>> +}

> >>> +

> >>> +/*

> >>>   *

> >>>   * Copy

> >>>   * ********************************************************

> >>> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t

> len, odp_packet_t

> >>> *tail)

> >>>

> >>>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

> >>>  {

> >>> -     odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

> >>> -     uint32_t pktlen = srchdr->frame_len;

> >>> +     uint32_t pktlen = odp_packet_len(pkt);

> >>>       odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

> >>>

> >>>       if (newpkt != ODP_PACKET_INVALID) {

> >>> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt,

> uint32_t offset,

> >>>       uint8_t *dstaddr = (uint8_t *)dst;

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>>

> >>> -     if (offset + len > pkt_hdr->frame_len)

> >>> +     if (offset + len > packet_len(pkt_hdr))

> >>>               return -1;

> >>>

> >>>       while (len > 0) {

> >>> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t pkt,

> uint32_t offset,

> >>>       const uint8_t *srcaddr = (const uint8_t *)src;

> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>>

> >>> -     if (offset + len > pkt_hdr->frame_len)

> >>> +     if (offset + len > packet_len(pkt_hdr))

> >>>               return -1;

> >>>

> >>> +     ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

> >>> +

> >>>       while (len > 0) {

> >>>               mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

> >>>               cpylen = len > seglen ? seglen : len;

> >>> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t

> dst, uint32_t

> >>> dst_offset,

> >>>       uint32_t src_seglen = 0; /* GCC */

> >>>       int overlap;

> >>>

> >>> -     if (dst_offset + len > dst_hdr->frame_len ||

> >>> -         src_offset + len > src_hdr->frame_len)

> >>> +     if (dst_offset + len > packet_len(dst_hdr) ||

> >>> +         src_offset + len > packet_len(src_hdr))

> >>>               return -1;

> >>>

> >>> +     ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

> >>> +

> >>>       overlap = (dst_hdr == src_hdr &&

> >>>                  ((dst_offset <= src_offset &&

> >>>                    dst_offset + len >= src_offset) ||

> >>> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

> >>>       len += snprintf(&str[len], n - len,

> >>>                       "  l4_offset    %" PRIu32 "\n",

> hdr->p.l4_offset);

> >>>       len += snprintf(&str[len], n - len,

> >>> -                     "  frame_len    %" PRIu32 "\n", hdr->frame_len);

> >>> +                     "  frame_len    %" PRIu32 "\n", packet_len(hdr));

> >>>       len += snprintf(&str[len], n - len,

> >>>                       "  input        %" PRIu64 "\n",

> >>>                       odp_pktio_to_u64(hdr->input));

> >>> --

> >>> 2.9.3

> >>

>




-- 
[image: Linaro] <http://www.linaro.org/>
François-Frédéric Ozog | *Director Linaro Networking Group*
T: +33.67221.6485
francois.ozog@linaro.org | Skype: ffozog
Bill Fischofer Feb. 18, 2017, 4:27 p.m. UTC | #5
On Sat, Feb 18, 2017 at 9:57 AM, Francois Ozog <francois.ozog@linaro.org>
wrote:

> Well, problem is still there.

> You are doing something on a packet that may not exist anymore.

>


Can you elaborate? The bug fix patch eliminates the race condition that
Janne pointed out because no thread manipulates a packet after decrementing
the ref_count other than to free it if that operation decremented the
ref_count to 0.


>

> On 17 February 2017 at 22:08, Bill Fischofer <bill.fischofer@linaro.org>

> wrote:

>

>> I've posted patch http://patches.opendataplane.org/patch/8155/ to

>> address this issue.  It goes on api-next on top of patches

>> http://patches.opendataplane.org/patch/7879/ and

>> http://patches.opendataplane.org/patch/8154/

>>

>> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

>> <bill.fischofer@linaro.org> wrote:

>> > First off, thank you very much for this review.

>> >

>> > Please note that this code has been streamlined in patch

>> > http://patches.opendataplane.org/patch/7879/ and has been further

>> > refined with patch http://patches.opendataplane.org/patch/8145/ but

>> > the exposure you identify still exists in that code.

>> >

>> > On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

>> > <janne.peltonen@nokia.com> wrote:

>> >> Hi,

>> >>

>> >> I took a look at the packet references and it seems to me that

>> >> either the implementation is a bit racy or I confused myself

>> >> when reading the code. Or maybe I got the intended concurrency

>> >> semantics of the packet references wrong?

>> >>

>> >> My first issue is that packet_free() may access freed packet

>> >> header or corrupt unshared_len.

>> >>

>> >> The packet free function looks like this:

>> >>

>> >> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> >> {

>> >>         odp_packet_hdr_t *ref_hdr;

>> >>         uint32_t ref_count;

>> >>

>> >>         do {

>> >>                 ref_hdr = pkt_hdr->ref_hdr;

>> >>                 ref_count = packet_ref_count(pkt_hdr) - 1;

>> >>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>> >>

>> >>                 if (ref_count == 1)

>> >>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> >>

>> >>                 pkt_hdr = ref_hdr;

>> >>         } while (pkt_hdr);

>> >> }

>> >>

>> >> The problem here is that decrementing the ref_count, checking

>> >> its value and updating unshared_len is not a single atomic

>> >> operation. By the time packet_free() checks if ref_count == 1

>> >> (i.e. if there is exactly one other reference left somewhere),

>> >> the true ref_count may have already been changed by another

>> >> thread doing packet_free() or packet_ref().

>> >>

>> >> For example, if two threads have a reference to the same packet

>> >> then execution (or the relevant memory ops) may get "interleaved"

>> >> as follows:

>> >>

>> >> T1: call packet_free()

>> >> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

>> >> At this point ref_count variable is 1

>> >> T1: call free_bufs()

>> >> T1: call packet_ref_dec()

>> >> Now the ref_count of the packet header is 1.

>> >> T2: call and complete packet_free()

>> >> Thread 2 sees refcount 1 in the packet and frees the buffers

>> >> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> >> Thread 1 accesses freed buffer for reading and writing.

>> >

>> > I agree. These steps should be reversed so that the code should read:

>> >

>> > if (ref_count == 1)

>> >    pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> >

>> > free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>> >

>> > Or using the code with the above two patches applied, the code should

>> read:

>> >

>> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> > {

>> >         odp_packet_hdr_t *ref_hdr;

>> >         uint32_t ref_count;

>> >         int num_seg;

>> >

>> >         do {

>> >                 ref_count = packet_ref_count(pkt_hdr);

>> >                 num_seg = pkt_hdr->buf_hdr.segcount;

>> >                 ref_hdr = pkt_hdr->ref_hdr;

>> >

>> >                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg

>> == 1) &&

>> >                     ref_count == 1)) {

>> >                         buffer_free_multi((odp_buffer_t

>> > *)&pkt_hdr->buf_hdr.handle.handle, 1);

>> >                 } else {

>> >                         if (ref_count == 2)

>> >                                 pkt_hdr->unshared_len =

>> pkt_hdr->frame_len;

>> >

>> >                         free_bufs(pkt_hdr, 0, num_seg);

>> >                  }

>> >

>> >                  pkt_hdr = ref_hdr;

>> >         } while (pkt_hdr);

>> > }

>> >

>> > The mistake was trying to optimize things so that unshared_len is not

>> > set if the buffers are being freed, but that exposes these race

>> > conditions. So the worst that should now happen is that it is set

>> > unnecessarily before being freed.

>> >

>> > If you concur I'll fold this fix into a v3 for patch

>> > http://patches.opendataplane.org/patch/8145/

>> >

>> >>

>> >> Similarly, if T2 created a new reference, T1 would have

>> >> a wrong idea of the number of remaining references and

>> >> would adjust the unshared_len to an incorrect value.

>> >>

>> >> Right?

>> >>

>> >> Maybe other modifications of unshared_len are also racy.

>> >

>> > I don't believe so, because references do not change the existing ODP

>> > restriction that two threads cannot share the same odp_packet_t.  When

>> > a packet reference is created it returns a separate odp_packet_t that

>> > has its own metadata. So unshared_len is always private to an

>> > individual odp_packet_t. The exception is static references but in

>> > this case the entire

>> > packet along with its metadata must be treated as read only so

>> > operations like odp_packet_push_head() that would try to modify

>> > unshared_len are prohibited.

>> >

>> >>

>> >>

>> >>

>> >> The second issue is that the atomic ops for setting and

>> >> reading the ref count seem to have too relaxed memory

>> >> ordering. In particular, packet_ref_dec() must not happen

>> >> (be visible to other threads) before its caller is done

>> >> with the packet and the related memory accesses have

>> >> completed. Now there does not seem to be any optimization barrier

>> >> or memory barrier to prevent the ref count decrement from

>> >> happening too early. So I think it is at least theoretically

>> >> possible that a thread e.g. reads from a packet buffer

>> >> after it has already been freed by another thread, somehow

>> >> like this:

>> >>

>> >> Source code order:

>> >> T1: interesting_data = read_from_pkt(pkt)

>> >> T1: packet_free(pkt)

>> >>

>> >> Order visible to T2:

>> >> 1: ref count decr

>> >> 2: read from pkt

>> >>

>> >> Now if T2 goes and frees the remaining reference between

>> >> steps 1 and 2, T1 may get even more interesting data.

>> >>

>> >> Right?

>> >

>> > I don't believe so. The semantics of odp_atomic_fetch_dec_u32(), which

>> > is what packet_ref_dec() uses, says that no two calls can see the same

>> > fetched value, so only one thread will return ref_count == 1 and free

>> > the buffer. Note that if I see ref_count == 1 no other thread can be

>> > trying to increment it via a concurrent odp_packet_ref() call because

>> > that would mean that two threads were trying to manipulate the same

>> > odp_packet_t, which is prohibited.

>> >

>> > For CPUs that support out-of-order instruction execution, reordering is

>> > only permitted provided that it, and any speculative execution, is

>> > semantically consistent with sequential execution. If this were not

>> > the case you'd constantly have to worry about a processor turning

>> >

>> > T1: interesting_data = read_from_pkt(pkt)

>> > T1: packet_free(pkt)

>> >

>> > into

>> >

>> > T1: packet_free(pkt)

>> > T1: interesting_data = read_from_pkt(pkt)

>> >

>> > In your scenario above: T2 cannot be issuing a read to pkt after

>> > ref_count is decremented because the only way that T2 could be

>> > decrementing ref_count would be if T2 issued an odp_packet_free() call

>> > for it. Obviously if it tries to reference pkt after such a call that

>> > is an application error.

>> >

>> > Thanks again for your much-appreciated help in looking at this!

>> >

>> >>

>> >>         Janne

>> >>

>> >>

>> >>> -----Original Message-----

>> >>> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of

>> Bill Fischofer

>> >>> Sent: Wednesday, January 11, 2017 4:34 AM

>> >>> To: lng-odp@lists.linaro.org

>> >>> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

>> implement reference apis

>> >>>

>> >>> Implement the APIs:

>> >>> - odp_packet_ref_static()

>> >>> - odp_packet_ref()

>> >>> - odp_packet_ref_pkt()

>> >>> - odp_packet_has_ref()

>> >>> - odp_packet_unshared_len()

>> >>>

>> >>> This also involves functional upgrades to the existing packet

>> manipulation

>> >>> APIs to work with packet references as input arguments.

>> >>>

>> >>> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

>> >>> ---

>> >>>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

>> >>>  platform/linux-generic/odp_packet.c                | 536

>> +++++++++++++++++----

>> >>>  2 files changed, 516 insertions(+), 107 deletions(-)

>> >>>

>> >>> diff --git a/platform/linux-generic/include/odp_packet_internal.h

>> b/platform/linux-

>> >>> generic/include/odp_packet_internal.h

>> >>> index e6e9d74..607560d 100644

>> >>> --- a/platform/linux-generic/include/odp_packet_internal.h

>> >>> +++ b/platform/linux-generic/include/odp_packet_internal.h

>> >>> @@ -19,6 +19,7 @@ extern "C" {

>> >>>

>> >>>  #include <odp/api/align.h>

>> >>>  #include <odp/api/debug.h>

>> >>> +#include <odp_debug_internal.h>

>> >>>  #include <odp_buffer_internal.h>

>> >>>  #include <odp_pool_internal.h>

>> >>>  #include <odp_buffer_inlines.h>

>> >>> @@ -168,7 +169,7 @@ typedef struct {

>> >>>   * packet_init(). Because of this any new fields added must be

>> reviewed for

>> >>>   * initialization requirements.

>> >>>   */

>> >>> -typedef struct {

>> >>> +typedef struct odp_packet_hdr_t {

>> >>>       /* common buffer header */

>> >>>       odp_buffer_hdr_t buf_hdr;

>> >>>

>> >>> @@ -184,6 +185,13 @@ typedef struct {

>> >>>       uint32_t headroom;

>> >>>       uint32_t tailroom;

>> >>>

>> >>> +     /* Fields used to support packet references */

>> >>> +     uint32_t unshared_len;

>> >>> +     struct odp_packet_hdr_t *ref_hdr;

>> >>> +     uint32_t ref_offset;

>> >>> +     uint32_t ref_len;

>> >>> +     odp_atomic_u32_t ref_count;

>> >>> +

>> >>>       /*

>> >>>        * Members below are not initialized by packet_init()

>> >>>        */

>> >>> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t

>> *odp_packet_hdr(odp_packet_t pkt)

>> >>>       return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

>> >>>  }

>> >>>

>> >>> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t

>> pkt,

>> >>> +                                                 uint32_t *offset)

>> >>> +{

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>> >>> +     uint32_t ref_offset = 0;

>> >>> +

>> >>> +     while (pkt_hdr->ref_hdr) {

>> >>> +             ref_offset = pkt_hdr->ref_offset;

>> >>> +             prev_hdr   = pkt_hdr;

>> >>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>> >>> +     }

>> >>> +

>> >>> +     if (offset) {

>> >>> +             if (prev_hdr != pkt_hdr)

>> >>> +                     ref_offset += pkt_hdr->frame_len -

>> prev_hdr->ref_len;

>> >>> +             *offset = ref_offset;

>> >>> +     }

>> >>> +

>> >>> +     return pkt_hdr;

>> >>> +}

>> >>> +

>> >>> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t

>> *pkt_hdr,

>> >>> +                                                 odp_packet_hdr_t

>> *cur_hdr,

>> >>> +                                                 uint32_t *offset)

>> >>> +{

>> >>> +     uint32_t ref_offset = 0;

>> >>> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

>> >>> +

>> >>> +     while (pkt_hdr->ref_hdr != cur_hdr) {

>> >>> +             ref_offset = pkt_hdr->ref_offset;

>> >>> +             prev_hdr   = pkt_hdr;

>> >>> +             pkt_hdr    = pkt_hdr->ref_hdr;

>> >>> +     }

>> >>> +

>> >>> +     if (offset) {

>> >>> +             if (prev_hdr != pkt_hdr)

>> >>> +                     ref_offset += pkt_hdr->frame_len -

>> prev_hdr->ref_len;

>> >>> +             *offset = ref_offset;

>> >>> +     }

>> >>> +

>> >>> +     return pkt_hdr;

>> >>> +}

>> >>> +

>> >>> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t

>> *pkt_hdr)

>> >>> +{

>> >>> +     return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

>> >>> +}

>> >>> +

>> >>>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t

>> *src_hdr,

>> >>>                                              odp_packet_hdr_t

>> *dst_hdr)

>> >>>  {

>> >>> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t

>> *pkt_hdr, uint32_t

>> >>> len)

>> >>>

>> >>>       pkt_hdr->tailroom  += len;

>> >>>       pkt_hdr->frame_len -= len;

>> >>> +     pkt_hdr->unshared_len -= len;

>> >>>       pkt_hdr->buf_hdr.seg[last].len -= len;

>> >>>  }

>> >>>

>> >>>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

>> >>>  {

>> >>> -     return pkt_hdr->frame_len;

>> >>> +     uint32_t pkt_len = 0;

>> >>> +     uint32_t offset  = 0;

>> >>> +

>> >>> +     do {

>> >>> +             pkt_len += pkt_hdr->frame_len - offset;

>> >>> +             offset   = pkt_hdr->ref_offset;

>> >>> +             if (pkt_hdr->ref_hdr)

>> >>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>> >>> +                                pkt_hdr->ref_len);

>> >>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>> >>> +     } while (pkt_hdr);

>> >>> +

>> >>> +     return pkt_len;

>> >>> +}

>> >>> +

>> >>> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

>> >>> +{

>> >>> +     return odp_atomic_load_u32(&pkt_hdr->ref_count);

>> >>> +}

>> >>> +

>> >>> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr,

>> uint32_t n)

>> >>> +{

>> >>> +     odp_atomic_init_u32(&pkt_hdr->ref_count, n);

>> >>>  }

>> >>>

>> >>>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr,

>> uint32_t len)

>> >>>  {

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>> +

>> >>>       pkt_hdr->frame_len = len;

>> >>> +     pkt_hdr->unshared_len = len;

>> >>>  }

>> >>>

>> >>>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

>> >>> diff --git a/platform/linux-generic/odp_packet.c

>> b/platform/linux-generic/odp_packet.c

>> >>> index f632a51..170965a 100644

>> >>> --- a/platform/linux-generic/odp_packet.c

>> >>> +++ b/platform/linux-generic/odp_packet.c

>> >>> @@ -33,13 +33,24 @@ static inline odp_buffer_t

>> buffer_handle(odp_packet_hdr_t *pkt_hdr)

>> >>>       return pkt_hdr->buf_hdr.handle.handle;

>> >>>  }

>> >>>

>> >>> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

>> >>> +{

>> >>> +     return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

>> >>> +}

>> >>> +

>> >>> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

>> >>> +{

>> >>> +     return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

>> >>> +}

>> >>> +

>> >>>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

>> >>>                                     uint32_t seg_idx)

>> >>>  {

>> >>>       return pkt_hdr->buf_hdr.seg[seg_idx].len;

>> >>>  }

>> >>>

>> >>> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

>> uint32_t seg_idx)

>> >>> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

>> >>> +                                    uint32_t seg_idx)

>> >>>  {

>> >>>       return pkt_hdr->buf_hdr.seg[seg_idx].data;

>> >>>  }

>> >>> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t

>> *pkt_hdr)

>> >>>               return pkt_hdr->buf_hdr.segcount - 1;

>> >>>  }

>> >>>

>> >>> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>> >>> +{

>> >>> +     return pkt_hdr->buf_hdr.seg[0].data;

>> >>> +}

>> >>> +

>> >>>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t

>> *pkt_hdr)

>> >>>  {

>> >>>       return packet_seg_len(pkt_hdr, 0);

>> >>> @@ -64,11 +80,6 @@ static inline uint32_t

>> packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)

>> >>>       return packet_seg_len(pkt_hdr, last);

>> >>>  }

>> >>>

>> >>> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

>> >>> -{

>> >>> -     return pkt_hdr->buf_hdr.seg[0].data;

>> >>> -}

>> >>> -

>> >>>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

>> >>>  {

>> >>>       int last = packet_last_seg(pkt_hdr);

>> >>> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t

>> *pkt_hdr, uint32_t len)

>> >>>  {

>> >>>       pkt_hdr->headroom  -= len;

>> >>>       pkt_hdr->frame_len += len;

>> >>> +     pkt_hdr->unshared_len += len;

>> >>>       pkt_hdr->buf_hdr.seg[0].data -= len;

>> >>>       pkt_hdr->buf_hdr.seg[0].len  += len;

>> >>>  }

>> >>> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t

>> *pkt_hdr, uint32_t len)

>> >>>  {

>> >>>       pkt_hdr->headroom  += len;

>> >>>       pkt_hdr->frame_len -= len;

>> >>> +     pkt_hdr->unshared_len -= len;

>> >>>       pkt_hdr->buf_hdr.seg[0].data += len;

>> >>>       pkt_hdr->buf_hdr.seg[0].len  -= len;

>> >>>  }

>> >>> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t

>> *pkt_hdr, uint32_t len)

>> >>>

>> >>>       pkt_hdr->tailroom  -= len;

>> >>>       pkt_hdr->frame_len += len;

>> >>> +     pkt_hdr->unshared_len += len;

>> >>>       pkt_hdr->buf_hdr.seg[last].len += len;

>> >>>  }

>> >>>

>> >>> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t

>> *dst,

>> >>>       dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

>> >>>       dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

>> >>>

>> >>> +     /* reference related metadata */

>> >>> +     dst->ref_len      = src->ref_len;

>> >>> +     dst->unshared_len = src->unshared_len;

>> >>> +

>> >>>       /* segmentation data is not copied:

>> >>>        *   buf_hdr.seg[]

>> >>>        *   buf_hdr.segcount

>> >>> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t

>> *pkt_hdr,

>> >>>       int seg = 0;

>> >>>       int seg_count = pkt_hdr->buf_hdr.segcount;

>> >>>

>> >>> -     if (odp_unlikely(offset >= pkt_hdr->frame_len))

>> >>> +     /* Special processing for references */

>> >>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> >>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>> >>> +             offset   += (pkt_hdr->ref_hdr->frame_len -

>> pkt_hdr->ref_len);

>> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> >>> +             seg_count = pkt_hdr->buf_hdr.segcount;

>> >>> +     }

>> >>> +

>> >>> +     if (odp_unlikely(offset > pkt_hdr->frame_len))

>> >>>               return NULL;

>> >>>

>> >>>       if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1))

>> {

>> >>> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t

>> *pkt_hdr)

>> >>>       pkt_hdr->p.l2_offset        = 0;

>> >>>       pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

>> >>>       pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

>> >>> +

>> >>> +     /* Ensure dummy pkt_hdrs used in I/O recv classification are

>> valid */

>> >>> +     pkt_hdr->ref_hdr = NULL;

>> >>>  }

>> >>>

>> >>>  /**

>> >>> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t

>> *pkt_hdr, uint32_t

>> >>> len,

>> >>>                            CONFIG_PACKET_TAILROOM;

>> >>>

>> >>>       pkt_hdr->input = ODP_PKTIO_INVALID;

>> >>> +

>> >>> +     /* By default packet has no references */

>> >>> +     pkt_hdr->unshared_len = len;

>> >>> +     pkt_hdr->ref_hdr = NULL;

>> >>>  }

>> >>>

>> >>>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int

>> num)

>> >>> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t

>> *pkt_hdr[], int num)

>> >>>

>> >>>       hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

>> >>>       hdr->buf_hdr.seg[0].len  = BASE_LEN;

>> >>> +     packet_ref_count_set(hdr, 1);

>> >>>

>> >>>       /* Link segments */

>> >>>       if (CONFIG_PACKET_MAX_SEGS != 1) {

>> >>> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t

>> *pkt_hdr[], int num)

>> >>>                       for (i = 1; i < num; i++) {

>> >>>                               odp_buffer_hdr_t *buf_hdr;

>> >>>

>> >>> +                             packet_ref_count_set(pkt_hdr[i], 1);

>> >>>                               buf_hdr = &pkt_hdr[i]->buf_hdr;

>> >>>                               hdr->buf_hdr.seg[i].hdr  = buf_hdr;

>> >>>                               hdr->buf_hdr.seg[i].data =

>> buf_hdr->base_data;

>> >>> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t

>> *add_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>               new_hdr->buf_hdr.seg[0].len   = seg_len;

>> >>>

>> >>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>> >>> -             new_hdr->frame_len = pkt_hdr->frame_len + len;

>> >>> -             new_hdr->headroom  = pool->headroom + offset;

>> >>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>> >>> +             new_hdr->frame_len    = pkt_hdr->frame_len + len;

>> >>> +             new_hdr->unshared_len = pkt_hdr->unshared_len + len;

>> >>> +             new_hdr->headroom     = pool->headroom + offset;

>> >>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>> >>>

>> >>>               pkt_hdr = new_hdr;

>> >>>       } else {

>> >>> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t

>> *add_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>               last = packet_last_seg(pkt_hdr);

>> >>>               pkt_hdr->buf_hdr.seg[last].len = seg_len;

>> >>>

>> >>> -             pkt_hdr->frame_len += len;

>> >>> -             pkt_hdr->tailroom   = pool->tailroom + offset;

>> >>> +             pkt_hdr->frame_len    += len;

>> >>> +             pkt_hdr->unshared_len += len;

>> >>> +             pkt_hdr->tailroom      = pool->tailroom + offset;

>> >>>       }

>> >>>

>> >>>       return pkt_hdr;

>> >>> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t

>> *add_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>

>> >>>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first,

>> int num)

>> >>>  {

>> >>> -     int i;

>> >>> +     int i, nfree;

>> >>>       odp_buffer_t buf[num];

>> >>>

>> >>> -     for (i = 0; i < num; i++)

>> >>> -             buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first +

>> i].hdr);

>> >>> +     for (i = 0, nfree = 0; i < num; i++) {

>> >>> +             odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first +

>> i].hdr;

>> >>> +

>> >>> +             if (packet_ref_dec(hdr) == 1)

>> >>> +                     buf[nfree++] = buffer_handle(hdr);

>> >>> +     }

>> >>>

>> >>> -     buffer_free_multi(buf, num);

>> >>> +     if (nfree > 0)

>> >>> +             buffer_free_multi(buf, nfree);

>> >>>  }

>> >>>

>> >>>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t

>> *pkt_hdr,

>> >>> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t

>> *free_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>

>> >>>       if (head) {

>> >>>               odp_packet_hdr_t *new_hdr;

>> >>> -             int i;

>> >>> +             int i, nfree;

>> >>>               odp_buffer_t buf[num];

>> >>>

>> >>> -             for (i = 0; i < num; i++)

>> >>> -                     buf[i] = buffer_handle(pkt_hdr->buf_hdr

>> .seg[i].hdr);

>> >>> +             for (i = 0, nfree = 0; i < num; i++) {

>> >>> +                     new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>> >>> +

>> >>> +                     if (packet_ref_dec(new_hdr) == 1)

>> >>> +                             buf[nfree++] = buffer_handle(new_hdr);

>> >>> +             }

>> >>>

>> >>>               /* First remaining segment is the new packet descriptor

>> */

>> >>>               new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

>> >>> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t

>> *free_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>               packet_seg_copy_md(new_hdr, pkt_hdr);

>> >>>

>> >>>               /* Tailroom not changed */

>> >>> -             new_hdr->tailroom  = pkt_hdr->tailroom;

>> >>> -             new_hdr->headroom  = seg_headroom(new_hdr, 0);

>> >>> -             new_hdr->frame_len = pkt_hdr->frame_len - free_len;

>> >>> +             new_hdr->tailroom     = pkt_hdr->tailroom;

>> >>> +             new_hdr->headroom     = seg_headroom(new_hdr, 0);

>> >>> +             new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

>> >>> +             new_hdr->unshared_len = pkt_hdr->unshared_len -

>> free_len;

>> >>>

>> >>>               pull_head(new_hdr, pull_len);

>> >>>

>> >>>               pkt_hdr = new_hdr;

>> >>>

>> >>> -             buffer_free_multi(buf, num);

>> >>> +             if (nfree > 0)

>> >>> +                     buffer_free_multi(buf, nfree);

>> >>>       } else {

>> >>>               /* Free last 'num' bufs */

>> >>>               free_bufs(pkt_hdr, num_remain, num);

>> >>> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t

>> *free_segments(odp_packet_hdr_t

>> >>> *pkt_hdr,

>> >>>                * of the metadata. */

>> >>>               pkt_hdr->buf_hdr.segcount = num_remain;

>> >>>               pkt_hdr->frame_len -= free_len;

>> >>> +             pkt_hdr->unshared_len -= free_len;

>> >>>               pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain -

>> 1);

>> >>>

>> >>>               pull_tail(pkt_hdr, pull_len);

>> >>> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t

>> pool_hdl, uint32_t len,

>> >>>       return num;

>> >>>  }

>> >>>

>> >>> -void odp_packet_free(odp_packet_t pkt)

>> >>> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> -     int num_seg = pkt_hdr->buf_hdr.segcount;

>> >>> +     odp_packet_hdr_t *ref_hdr;

>> >>> +     uint32_t ref_count;

>> >>>

>> >>> -     if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

>> >>> -             buffer_free_multi((odp_buffer_t *)&pkt, 1);

>> >>> -     else

>> >>> -             free_bufs(pkt_hdr, 0, num_seg);

>> >>> -}

>> >>> +     do {

>> >>> +             ref_hdr = pkt_hdr->ref_hdr;

>> >>> +             ref_count = packet_ref_count(pkt_hdr) - 1;

>> >>> +             free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>> >>>

>> >>> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>> >>> -{

>> >>> -     if (CONFIG_PACKET_MAX_SEGS == 1) {

>> >>> -             buffer_free_multi((const odp_buffer_t * const)pkt, num);

>> >>> -     } else {

>> >>> -             odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

>> >>> -             int i, j;

>> >>> -             int bufs = 0;

>> >>> +             if (ref_count == 1)

>> >>> +                     pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> >>>

>> >>> -             for (i = 0; i < num; i++) {

>> >>> -                     odp_packet_hdr_t *pkt_hdr =

>> odp_packet_hdr(pkt[i]);

>> >>> -                     int num_seg = pkt_hdr->buf_hdr.segcount;

>> >>> -                     odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

>> >>> -

>> >>> -                     buf[bufs] = (odp_buffer_t)pkt[i];

>> >>> -                     bufs++;

>> >>> +             pkt_hdr = ref_hdr;

>> >>> +     } while (pkt_hdr);

>> >>> +}

>> >>>

>> >>> -                     if (odp_likely(num_seg == 1))

>> >>> -                             continue;

>> >>> +void odp_packet_free(odp_packet_t pkt)

>> >>> +{

>> >>> +     packet_free(odp_packet_hdr(pkt));

>> >>> +}

>> >>>

>> >>> -                     for (j = 1; j < num_seg; j++) {

>> >>> -                             buf[bufs] =

>> buffer_handle(buf_hdr->seg[j].hdr);

>> >>> -                             bufs++;

>> >>> -                     }

>> >>> -             }

>> >>> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

>> >>> +{

>> >>> +     int i;

>> >>>

>> >>> -             buffer_free_multi(buf, bufs);

>> >>> -     }

>> >>> +     for (i = 0; i < num; i++)

>> >>> +             packet_free(odp_packet_hdr(pkt[i]));

>> >>>  }

>> >>>

>> >>>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

>> >>> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t

>> len)

>> >>>       if (len > pool->headroom + pool->data_size + pool->tailroom)

>> >>>               return -1;

>> >>>

>> >>> +     if (pkt_hdr->ref_hdr)

>> >>> +             packet_free(pkt_hdr->ref_hdr);

>> >>> +

>> >>>       packet_init(pkt_hdr, len, 0);

>> >>>

>> >>>       return 0;

>> >>> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

>> >>>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     uint32_t buf_len = 0;

>> >>>

>> >>> -     return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

>> >>> +     do {

>> >>> +             buf_len += pkt_hdr->buf_hdr.size *

>> pkt_hdr->buf_hdr.segcount;

>> >>> +             pkt_hdr  = pkt_hdr->ref_hdr;

>> >>> +     } while (pkt_hdr);

>> >>> +

>> >>> +     return buf_len;

>> >>>  }

>> >>>

>> >>>  void *odp_packet_data(odp_packet_t pkt)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>>

>> >>> -     return packet_data(pkt_hdr);

>> >>> +     return packet_map(pkt_hdr, 0, NULL, NULL);

>> >>>  }

>> >>>

>> >>>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

>> >>> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

>> >>>

>> >>>  uint32_t odp_packet_len(odp_packet_t pkt)

>> >>>  {

>> >>> -     return odp_packet_hdr(pkt)->frame_len;

>> >>> +     return packet_len(odp_packet_hdr(pkt));

>> >>> +}

>> >>> +

>> >>> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

>> >>> +{

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     uint32_t pkt_len = 0, offset = 0;

>> >>> +

>> >>> +     do {

>> >>> +             if (packet_ref_count(pkt_hdr) > 1) {

>> >>> +                     if (offset == 0)

>> >>> +                             pkt_len += pkt_hdr->unshared_len;

>> >>> +                     break;

>> >>> +             }

>> >>> +

>> >>> +             pkt_len += pkt_hdr->frame_len - offset;

>> >>> +             offset   = pkt_hdr->ref_offset;

>> >>> +

>> >>> +             if (pkt_hdr->ref_hdr)

>> >>> +                     offset += (pkt_hdr->ref_hdr->frame_len -

>> >>> +                                pkt_hdr->ref_len);

>> >>> +

>> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> >>> +     } while (pkt_hdr);

>> >>> +

>> >>> +     return pkt_len;

>> >>>  }

>> >>>

>> >>>  uint32_t odp_packet_headroom(odp_packet_t pkt)

>> >>> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

>> >>>

>> >>>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

>> >>>  {

>> >>> -     return odp_packet_hdr(pkt)->tailroom;

>> >>> +     return odp_packet_last_hdr(pkt, NULL)->tailroom;

>> >>>  }

>> >>>

>> >>>  void *odp_packet_tail(odp_packet_t pkt)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>> >>>

>> >>>       return packet_tail(pkt_hdr);

>> >>>  }

>> >>> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt,

>> uint32_t len,

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> >>>       uint32_t frame_len = pkt_hdr->frame_len;

>> >>> -     uint32_t headroom  = pkt_hdr->headroom;

>> >>> +     uint32_t headroom = pkt_hdr->headroom;

>> >>>       int ret = 0;

>> >>>

>> >>>       if (len > headroom) {

>> >>> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt,

>> uint32_t len,

>> >>>               segs = pkt_hdr->buf_hdr.segcount;

>> >>>

>> >>>               if (odp_unlikely((segs + num) >

>> CONFIG_PACKET_MAX_SEGS)) {

>> >>> +                     /* Handle recursively via references when

>> >>> +                      * working with referenced packets since another

>> >>> +                      * thread may be accessing it concurrently via

>> >>> +                      * its reference to it. */

>> >>> +                     if (packet_ref_count(pkt_hdr) > 1) {

>> >>> +                             odp_packet_t ref;

>> >>> +                             uint32_t unshared_len;

>> >>> +

>> >>> +                             push_head(pkt_hdr, headroom);

>> >>> +                             unshared_len = pkt_hdr->unshared_len;

>> >>> +                             ref = odp_packet_ref(*pkt, 0);

>> >>> +

>> >>> +                             if (ref == ODP_PACKET_INVALID) {

>> >>> +                                     pull_head(pkt_hdr, headroom);

>> >>> +                                     return -1;

>> >>> +                             }

>> >>> +

>> >>> +                             ret = odp_packet_extend_head(&ref,

>> >>> +                                                          len -

>> headroom,

>> >>> +                                                          data_ptr,

>> >>> +                                                          seg_len);

>> >>> +

>> >>> +                             if (ret < 0) {

>> >>> +                                     odp_packet_free(ref);

>> >>> +                                     pull_head(pkt_hdr, headroom);

>> >>> +                                     return -1;

>> >>> +                             }

>> >>> +

>> >>> +                             /* Since this is a special ref, the

>> >>> +                              * base pkt's unshared len is unchanged

>> */

>> >>> +                             pkt_hdr->unshared_len = unshared_len;

>> >>> +

>> >>> +                             /* Remove extra ref to the base pkt */

>> >>> +                             odp_packet_free(*pkt);

>> >>> +

>> >>> +                             /* Return the ref as the extension

>> result */

>> >>> +                             *pkt = ref;

>> >>> +                             return 1;

>> >>> +                     }

>> >>> +

>> >>>                       /* Cannot directly add new segments */

>> >>>                       odp_packet_hdr_t *new_hdr;

>> >>>                       int new_segs = 0;

>> >>> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt,

>> uint32_t len,

>> >>>

>> >>>                       pkt_hdr->buf_hdr.segcount = segs;

>> >>>                       pkt_hdr->frame_len        = frame_len;

>> >>> +                     pkt_hdr->unshared_len     = frame_len;

>> >>>                       pkt_hdr->headroom         = offset +

>> pool->headroom;

>> >>>                       pkt_hdr->tailroom         = pool->tailroom;

>> >>>

>> >>> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt,

>> uint32_t len,

>> >>>               push_head(pkt_hdr, len);

>> >>>       }

>> >>>

>> >>> -     if (data_ptr)

>> >>> -             *data_ptr = packet_data(pkt_hdr);

>> >>> +     if (data_ptr || seg_len) {

>> >>> +             uint32_t seg_ln = 0;

>> >>> +             void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

>> >>>

>> >>> -     if (seg_len)

>> >>> -             *seg_len = packet_first_seg_len(pkt_hdr);

>> >>> +             if (data_ptr)

>> >>> +                     *data_ptr = data;

>> >>> +

>> >>> +             if (seg_len)

>> >>> +                     *seg_len = seg_ln;

>> >>> +     }

>> >>>

>> >>>       return ret;

>> >>>  }

>> >>> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt,

>> uint32_t len)

>> >>>       if (len > pkt_hdr->frame_len)

>> >>>               return NULL;

>> >>>

>> >>> +     ODP_ASSERT(len <= pkt_hdr->unshared_len);

>> >>> +

>> >>>       pull_head(pkt_hdr, len);

>> >>>       return packet_data(pkt_hdr);

>> >>>  }

>> >>> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt,

>> uint32_t len)

>> >>>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

>> >>>                         void **data_ptr, uint32_t *seg_len_out)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

>> >>>       uint32_t seg_len = packet_first_seg_len(pkt_hdr);

>> >>> +     int ret = 0;

>> >>>

>> >>> -     if (len > pkt_hdr->frame_len)

>> >>> +     if (len > packet_len(pkt_hdr))

>> >>>               return -1;

>> >>>

>> >>> -     if (len < seg_len) {

>> >>> +     ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

>> >>> +

>> >>> +     /* Special processing for references */

>> >>> +     while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> >>> +             ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>> +             nxt_hdr = pkt_hdr->ref_hdr;

>> >>> +             len -= pkt_hdr->frame_len;

>> >>> +             len += pkt_hdr->ref_offset +

>> >>> +                     (nxt_hdr->frame_len - pkt_hdr->ref_len);

>> >>> +             pkt_hdr->ref_hdr = NULL;

>> >>> +             packet_free(pkt_hdr);

>> >>> +             pkt_hdr = nxt_hdr;

>> >>> +             seg_len = packet_first_seg_len(pkt_hdr);

>> >>> +             *pkt = packet_handle(pkt_hdr);

>> >>> +             ret = 1;

>> >>> +     }

>> >>> +

>> >>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>> >>> +         len < seg_len ||

>> >>> +         pkt_hdr->buf_hdr.segcount == 1) {

>> >>>               pull_head(pkt_hdr, len);

>> >>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>> >>> +     } else {

>> >>>               int num = 0;

>> >>>               uint32_t pull_len = 0;

>> >>>

>> >>> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt,

>> uint32_t len,

>> >>>               *pkt    = packet_handle(pkt_hdr);

>> >>>       }

>> >>>

>> >>> -     if (data_ptr)

>> >>> -             *data_ptr = packet_data(pkt_hdr);

>> >>> +     if (data_ptr || seg_len_out) {

>> >>> +             void *data_head = packet_map(pkt_hdr, 0, &seg_len,

>> NULL);

>> >>>

>> >>> -     if (seg_len_out)

>> >>> -             *seg_len_out = packet_first_seg_len(pkt_hdr);

>> >>> +             if (data_ptr)

>> >>> +                     *data_ptr = data_head;

>> >>>

>> >>> -     return 0;

>> >>> +             if (seg_len_out)

>> >>> +                     *seg_len_out = seg_len;

>> >>> +     }

>> >>> +

>> >>> +     return ret;

>> >>>  }

>> >>>

>> >>>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

>> >>>       void *old_tail;

>> >>>

>> >>>       if (len > pkt_hdr->tailroom)

>> >>>               return NULL;

>> >>>

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>> +

>> >>>       old_tail = packet_tail(pkt_hdr);

>> >>>       push_tail(pkt_hdr, len);

>> >>>

>> >>> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt,

>> uint32_t len)

>> >>>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

>> >>>                          void **data_ptr, uint32_t *seg_len_out)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

>> >>>       uint32_t frame_len = pkt_hdr->frame_len;

>> >>>       uint32_t tailroom  = pkt_hdr->tailroom;

>> >>>       uint32_t tail_off  = frame_len;

>> >>>       int ret = 0;

>> >>>

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>> +

>> >>>       if (len > tailroom) {

>> >>>               pool_t *pool = pool_entry_from_hdl(pkt_hdr->b

>> uf_hdr.pool_hdl);

>> >>>               int num;

>> >>> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt,

>> uint32_t len)

>> >>>       if (len > packet_last_seg_len(pkt_hdr))

>> >>>               return NULL;

>> >>>

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>>       pull_tail(pkt_hdr, len);

>> >>>

>> >>>       return packet_tail(pkt_hdr);

>> >>> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt,

>> uint32_t len,

>> >>>  {

>> >>>       int last;

>> >>>       uint32_t seg_len;

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

>> >>> +     uint32_t offset;

>> >>> +     odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

>> >>> +     odp_packet_hdr_t *pkt_hdr, *prev_hdr;

>> >>>

>> >>> -     if (len > pkt_hdr->frame_len)

>> >>> +     if (len > packet_len(first_hdr))

>> >>>               return -1;

>> >>>

>> >>> +     pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

>> >>> +

>> >>> +     /* Special processing for references */

>> >>> +     while (len >= pkt_hdr->frame_len - offset &&

>> first_hdr->ref_hdr) {

>> >>> +             len -= (pkt_hdr->frame_len - offset);

>> >>> +             prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr,

>> &offset);

>> >>> +             ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

>> >>> +             prev_hdr->ref_hdr = NULL;

>> >>> +             packet_free(pkt_hdr);

>> >>> +             pkt_hdr = prev_hdr;

>> >>> +     }

>> >>> +

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

>> >>>       last    = packet_last_seg(pkt_hdr);

>> >>>       seg_len = packet_seg_len(pkt_hdr, last);

>> >>>

>> >>> -     if (len < seg_len) {

>> >>> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

>> >>> +         len < seg_len ||

>> >>> +         pkt_hdr->buf_hdr.segcount == 1) {

>> >>>               pull_tail(pkt_hdr, len);

>> >>> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

>> >>> +     } else {

>> >>>               int num = 0;

>> >>>               uint32_t pull_len = 0;

>> >>>

>> >>> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt,

>> odp_time_t timestamp)

>> >>>

>> >>>  int odp_packet_is_segmented(odp_packet_t pkt)

>> >>>  {

>> >>> -     return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +

>> >>> +     return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr !=

>> NULL;

>> >>>  }

>> >>>

>> >>>  int odp_packet_num_segs(odp_packet_t pkt)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     uint32_t segcount = 0, i;

>> >>> +     uint32_t seg_offset = 0, offset;

>> >>> +

>> >>> +     do {

>> >>> +             segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

>> >>> +             offset    = pkt_hdr->ref_offset;

>> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> >>> +             if (pkt_hdr) {

>> >>> +                     for (i = 0, seg_offset = 0;

>> >>> +                          i < pkt_hdr->buf_hdr.segcount;

>> >>> +                          i++, seg_offset++) {

>> >>> +                             if (offset <

>> pkt_hdr->buf_hdr.seg[i].len)

>> >>> +                                     break;

>> >>> +                             offset -= pkt_hdr->buf_hdr.seg[i].len;

>> >>> +                     }

>> >>> +             }

>> >>> +     } while (pkt_hdr);

>> >>>

>> >>> -     return pkt_hdr->buf_hdr.segcount;

>> >>> +     return segcount;

>> >>>  }

>> >>>

>> >>> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

>> >>> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

>> >>>  {

>> >>> -     (void)pkt;

>> >>> -

>> >>>       return 0;

>> >>>  }

>> >>>

>> >>>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

>> >>>  {

>> >>> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> -

>> >>> -     return packet_last_seg(pkt_hdr);

>> >>> +     return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

>> >>>  }

>> >>>

>> >>>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt,

>> odp_packet_seg_t seg)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>>

>> >>> -     if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_

>> seg(pkt_hdr)))

>> >>> +     if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

>> >>>               return ODP_PACKET_SEG_INVALID;

>> >>>

>> >>>       return seg + 1;

>> >>> @@ -1400,21 +1581,51 @@ odp_packet_seg_t

>> odp_packet_next_seg(odp_packet_t pkt,

>> >>> odp_packet_seg_t seg)

>> >>>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     uint32_t seg_offset = 0, offset = 0, i;

>> >>> +

>> >>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>> >>> +            pkt_hdr->ref_hdr) {

>> >>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>> >>> +             offset  = pkt_hdr->ref_offset;

>> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> >>> +             for (i = 0, seg_offset = 0;

>> >>> +                  i < pkt_hdr->buf_hdr.segcount;

>> >>> +                  i++, seg_offset++) {

>> >>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>> >>> +                             break;

>> >>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>> >>> +             }

>> >>> +     }

>> >>>

>> >>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>> >>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>> >>>               return NULL;

>> >>>

>> >>> -     return packet_seg_data(pkt_hdr, seg);

>> >>> +     return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

>> >>>  }

>> >>>

>> >>>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t

>> seg)

>> >>>  {

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     uint32_t seg_offset = 0, offset = 0, i;

>> >>> +

>> >>> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

>> >>> +            pkt_hdr->ref_hdr) {

>> >>> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

>> >>> +             offset  = pkt_hdr->ref_offset;

>> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> >>> +             for (i = 0, seg_offset = 0;

>> >>> +                  i < pkt_hdr->buf_hdr.segcount;

>> >>> +                  i++, seg_offset++) {

>> >>> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

>> >>> +                             break;

>> >>> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

>> >>> +             }

>> >>> +     }

>> >>>

>> >>> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

>> >>> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

>> >>>               return 0;

>> >>>

>> >>> -     return packet_seg_len(pkt_hdr, seg);

>> >>> +     return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

>> >>>  }

>> >>>

>> >>>  /*

>> >>> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t

>> *pkt_ptr, uint32_t offset,

>> >>> uint32_t len)

>> >>>  {

>> >>>       odp_packet_t pkt = *pkt_ptr;

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> -     uint32_t pktlen = pkt_hdr->frame_len;

>> >>> +     uint32_t pktlen = packet_len(pkt_hdr);

>> >>>       odp_packet_t newpkt;

>> >>>

>> >>>       if (offset > pktlen)

>> >>>               return -1;

>> >>>

>> >>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

>> >>> +

>> >>>       newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen +

>> len);

>> >>>

>> >>>       if (newpkt == ODP_PACKET_INVALID)

>> >>> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt,

>> uint32_t offset, uint32_t

>> >>> len,

>> >>>       if (align > ODP_CACHE_LINE_SIZE)

>> >>>               return -1;

>> >>>

>> >>> +     ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

>> >>> +

>> >>>       if (seglen >= len) {

>> >>>               misalign = align <= 1 ? 0 :

>> >>>                       ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

>> >>> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst,

>> odp_packet_t src)

>> >>>       uint32_t dst_len    = dst_hdr->frame_len;

>> >>>       uint32_t src_len    = src_hdr->frame_len;

>> >>>

>> >>> +     ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

>> >>> +

>> >>>       /* Do a copy if resulting packet would be out of segments or

>> packets

>> >>> -      * are from different pools. */

>> >>> +      * are from different pools or src is a reference. */

>> >>>       if (odp_unlikely((dst_segs + src_segs) >

>> CONFIG_PACKET_MAX_SEGS) ||

>> >>> -         odp_unlikely(dst_pool != src_pool)) {

>> >>> +         odp_unlikely(dst_pool != src_pool) ||

>> >>> +         odp_unlikely(packet_ref_count(src_hdr)) > 1) {

>> >>>               if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >=

>> 0) {

>> >>>                       (void)odp_packet_copy_from_pkt(*dst, dst_len,

>> >>>                                                      src, 0, src_len);

>> >>> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst,

>> odp_packet_t src)

>> >>>

>> >>>       add_all_segs(dst_hdr, src_hdr);

>> >>>

>> >>> -     dst_hdr->frame_len = dst_len + src_len;

>> >>> -     dst_hdr->tailroom  = src_hdr->tailroom;

>> >>> +     dst_hdr->frame_len    = dst_len + src_len;

>> >>> +     dst_hdr->unshared_len = dst_len + src_len;

>> >>> +     dst_hdr->tailroom     = src_hdr->tailroom;

>> >>>

>> >>>       /* Data was not moved in memory */

>> >>>       return 0;

>> >>> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt,

>> uint32_t len, odp_packet_t

>> >>> *tail)

>> >>>       if (len >= pktlen || tail == NULL)

>> >>>               return -1;

>> >>>

>> >>> +     ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

>> >>>       *tail = odp_packet_copy_part(*pkt, len, pktlen - len,

>> >>>                                    odp_packet_pool(*pkt));

>> >>>

>> >>> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt,

>> uint32_t len, odp_packet_t

>> >>> *tail)

>> >>>  }

>> >>>

>> >>>  /*

>> >>> + * References

>> >>> + */

>> >>> +

>> >>> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

>> >>> +{

>> >>> +     uint32_t i;

>> >>> +     odp_packet_hdr_t *hdr;

>> >>> +

>> >>> +     do {

>> >>> +             for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

>> >>> +                     hdr = pkt_hdr->buf_hdr.seg[i].hdr;

>> >>> +                     packet_ref_inc(hdr);

>> >>> +             }

>> >>> +

>> >>> +             pkt_hdr = pkt_hdr->ref_hdr;

>> >>> +     } while (pkt_hdr);

>> >>> +}

>> >>> +

>> >>> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

>> >>> +                                      uint32_t offset,

>> >>> +                                      odp_packet_hdr_t *ref_hdr)

>> >>> +{

>> >>> +     /* Catch attempted references to stale handles in debug builds

>> */

>> >>> +     ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

>> >>> +

>> >>> +     /* Splicing is from the last section of src pkt */

>> >>> +     while (ref_hdr->ref_hdr)

>> >>> +             ref_hdr = ref_hdr->ref_hdr;

>> >>> +

>> >>> +     /* Find section where splice begins */

>> >>> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

>> >>> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

>> >>> +             offset   += (pkt_hdr->ref_hdr->frame_len -

>> pkt_hdr->ref_len);

>> >>> +             pkt_hdr   = pkt_hdr->ref_hdr;

>> >>> +     }

>> >>> +

>> >>> +     ref_hdr->ref_hdr    = pkt_hdr;

>> >>> +     ref_hdr->ref_offset = offset;

>> >>> +     ref_hdr->ref_len    = pkt_hdr->frame_len;

>> >>> +

>> >>> +     if (offset < pkt_hdr->unshared_len)

>> >>> +             pkt_hdr->unshared_len = offset;

>> >>> +

>> >>> +     packet_ref(pkt_hdr);

>> >>> +     return _odp_packet_hdl(ref_hdr);

>> >>> +}

>> >>> +

>> >>> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

>> >>> +{

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +

>> >>> +     pkt_hdr->unshared_len = 0;

>> >>> +     packet_ref(pkt_hdr);

>> >>> +     return pkt;

>> >>> +}

>> >>> +

>> >>> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

>> >>> +{

>> >>> +     odp_packet_t hdr;

>> >>> +     odp_packet_hdr_t *pkt_hdr;

>> >>> +

>> >>> +     if (pkt == ODP_PACKET_INVALID)

>> >>> +             return ODP_PACKET_INVALID;

>> >>> +

>> >>> +     pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     if (offset >= packet_len(pkt_hdr))

>> >>> +             return ODP_PACKET_INVALID;

>> >>> +

>> >>> +     hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

>> >>> +

>> >>> +     if (hdr == ODP_PACKET_INVALID)

>> >>> +             return ODP_PACKET_INVALID;

>> >>> +

>> >>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>> >>> +}

>> >>> +

>> >>> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

>> >>> +                             odp_packet_t hdr)

>> >>> +{

>> >>> +     odp_packet_hdr_t *pkt_hdr;

>> >>> +

>> >>> +     if (pkt == ODP_PACKET_INVALID ||

>> >>> +         hdr == ODP_PACKET_INVALID ||

>> >>> +         pkt == hdr)

>> >>> +             return ODP_PACKET_INVALID;

>> >>> +

>> >>> +     ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

>> >>> +

>> >>> +     pkt_hdr = odp_packet_hdr(pkt);

>> >>> +     if (offset >= packet_len(pkt_hdr))

>> >>> +             return ODP_PACKET_INVALID;

>> >>> +

>> >>> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

>> >>> +}

>> >>> +

>> >>> +int odp_packet_has_ref(odp_packet_t pkt)

>> >>> +{

>> >>> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>> +

>> >>> +     return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) >

>> 1;

>> >>> +}

>> >>> +

>> >>> +/*

>> >>>   *

>> >>>   * Copy

>> >>>   * ********************************************************

>> >>> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt,

>> uint32_t len, odp_packet_t

>> >>> *tail)

>> >>>

>> >>>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

>> >>>  {

>> >>> -     odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

>> >>> -     uint32_t pktlen = srchdr->frame_len;

>> >>> +     uint32_t pktlen = odp_packet_len(pkt);

>> >>>       odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

>> >>>

>> >>>       if (newpkt != ODP_PACKET_INVALID) {

>> >>> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt,

>> uint32_t offset,

>> >>>       uint8_t *dstaddr = (uint8_t *)dst;

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>>

>> >>> -     if (offset + len > pkt_hdr->frame_len)

>> >>> +     if (offset + len > packet_len(pkt_hdr))

>> >>>               return -1;

>> >>>

>> >>>       while (len > 0) {

>> >>> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t

>> pkt, uint32_t offset,

>> >>>       const uint8_t *srcaddr = (const uint8_t *)src;

>> >>>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

>> >>>

>> >>> -     if (offset + len > pkt_hdr->frame_len)

>> >>> +     if (offset + len > packet_len(pkt_hdr))

>> >>>               return -1;

>> >>>

>> >>> +     ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

>> >>> +

>> >>>       while (len > 0) {

>> >>>               mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

>> >>>               cpylen = len > seglen ? seglen : len;

>> >>> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t

>> dst, uint32_t

>> >>> dst_offset,

>> >>>       uint32_t src_seglen = 0; /* GCC */

>> >>>       int overlap;

>> >>>

>> >>> -     if (dst_offset + len > dst_hdr->frame_len ||

>> >>> -         src_offset + len > src_hdr->frame_len)

>> >>> +     if (dst_offset + len > packet_len(dst_hdr) ||

>> >>> +         src_offset + len > packet_len(src_hdr))

>> >>>               return -1;

>> >>>

>> >>> +     ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

>> >>> +

>> >>>       overlap = (dst_hdr == src_hdr &&

>> >>>                  ((dst_offset <= src_offset &&

>> >>>                    dst_offset + len >= src_offset) ||

>> >>> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

>> >>>       len += snprintf(&str[len], n - len,

>> >>>                       "  l4_offset    %" PRIu32 "\n",

>> hdr->p.l4_offset);

>> >>>       len += snprintf(&str[len], n - len,

>> >>> -                     "  frame_len    %" PRIu32 "\n", hdr->frame_len);

>> >>> +                     "  frame_len    %" PRIu32 "\n",

>> packet_len(hdr));

>> >>>       len += snprintf(&str[len], n - len,

>> >>>                       "  input        %" PRIu64 "\n",

>> >>>                       odp_pktio_to_u64(hdr->input));

>> >>> --

>> >>> 2.9.3

>> >>

>>

>

>

>

> --

> [image: Linaro] <http://www.linaro.org/>

> François-Frédéric Ozog | *Director Linaro Networking Group*

> T: +33.67221.6485

> francois.ozog@linaro.org | Skype: ffozog

>

>
Savolainen, Petri (Nokia - FI/Espoo) Feb. 20, 2017, 3:01 p.m. UTC | #6
Hi,

We are already in the phase where code in master should be maintained for production quality. There is no hurry to merge in code that has questionable quality. Zero-copy packet references are not a trivial feature to implement. We are in a much better position to review, test and use the reference code if it's developed in phases. That's why I propose that we do multiple smaller steps:

1) merge a simple, copy-based implementation first (in api-next and master), which we can be sure does not break anything
2) write multi-threaded (performance) test apps for refs
3) cleanup, optimize normal packet code towards zero-copy multi-ref support (minimize places where refs are visible)
4) implement zero-copy multi-ref for most obvious use cases: maybe static references first ...
5) continue multi-ref implementation, or decide that some rare corner cases can be left with copy based implementation

This actually follows what has been done before: add a simple, copy-based implementation first and continue development/optimization from there, so that we can step back and compare easily with the previous version if, e.g., race conditions are found.

-Petri


> -----Original Message-----

> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of Bill

> Fischofer

> Sent: Saturday, February 18, 2017 6:28 PM

> To: Francois Ozog <francois.ozog@linaro.org>

> Cc: lng-odp@lists.linaro.org

> Subject: Re: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

> implement reference apis

> 

> On Sat, Feb 18, 2017 at 9:57 AM, Francois Ozog <francois.ozog@linaro.org>

> wrote:

> 

> > Well, problem is still there.

> > You are doing something on a packet that may not exist anymore.

> >

> 

> Can you elaborate? The bug fix patch eliminates the race condition that

> Janne pointed out because no thread manipulates a packet after

> decrementing

> the ref_count other than to free it if that operation decremented the

> ref_count to 0.

> 

> 

> >

> > On 17 February 2017 at 22:08, Bill Fischofer <bill.fischofer@linaro.org>

> > wrote:

> >

> >> I've posted patch http://patches.opendataplane.org/patch/8155/ to

> >> address this issue.  It goes on api-next on top of patches

> >> http://patches.opendataplane.org/patch/7879/ and

> >> http://patches.opendataplane.org/patch/8154/

> >>

> >> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

> >> <bill.fischofer@linaro.org> wrote:

> >> > First off, thank you very much for this review.

> >> >

> >> > Please note that this code has been streamlined in patch

> >> > http://patches.opendataplane.org/patch/7879/ and has been further

> >> > refined with patch http://patches.opendataplane.org/patch/8145/ but

> >> > the exposure you identify still exists in that code.

> >> >

> >> > On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

> >> > <janne.peltonen@nokia.com> wrote:

> >> >> Hi,

> >> >>

> >> >> I took a look at the packet references and it seems to me that

> >> >> either the implementation is a bit racy or I confused myself

> >> >> when reading the code. Or maybe I got the intended concurrency

> >> >> semantics of the packet references wrong?

> >> >>

> >> >> My first issue is that packet_free() may access freed packet

> >> >> header or corrupt unshared_len.

> >> >>

> >> >> The packet free function looks like this:

> >> >>

> >> >> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> >> >> {

> >> >>         odp_packet_hdr_t *ref_hdr;

> >> >>         uint32_t ref_count;

> >> >>

> >> >>         do {

> >> >>                 ref_hdr = pkt_hdr->ref_hdr;

> >> >>                 ref_count = packet_ref_count(pkt_hdr) - 1;

> >> >>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >> >>

> >> >>                 if (ref_count == 1)

> >> >>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >> >>

> >> >>                 pkt_hdr = ref_hdr;

> >> >>         } while (pkt_hdr);

> >> >> }

> >> >>

> >> >> The problem here is that decrementing the ref_count, checking

> >> >> its value and updating unshared_len is not single atomic

> >> >> operation. By the time packet_free() checks if ref_count == 1

> >> >> (i.e. if there is exactly one other reference left somewhere),

> >> >> the true ref_count may have already been changed by another

> >> >> thread doing packet_free() or packet_ref().

> >> >>

> >> >> For example, if two threads have a reference to the same packet

> >> >> then execution (or the relevant memory ops) may get "interleaved"

> >> >> as follows:

> >> >>

> >> >> T1: call packet_free()

> >> >> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

> >> >> At this point ref_count variable is 1

> >> >> T1: call free_bufs()

> >> >> T1: call packet_ref_dec()

> >> >> Now the ref_count of the packet header is 1.

> >> >> T2: call and complete packet_free()

> >> >> Thread 2 sees refcount 1 in the packet and frees the buffers

> >> >> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >> >> Thread 1 accesses freed buffer for reading and writing.

> >> >

> >> > I agree. These steps should be reversed so that the code should read:

> >> >

> >> > if (ref_count == 1)

> >> >    pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >> >

> >> > free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >> >

> >> > Or using the code with the above two patches applied, the code should

> >> read:

> >> >

> >> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> >> > {

> >> >         odp_packet_hdr_t *ref_hdr;

> >> >         uint32_t ref_count;

> >> >         int num_seg;

> >> >

> >> >         do {

> >> >                 ref_count = packet_ref_count(pkt_hdr);

> >> >                 num_seg = pkt_hdr->buf_hdr.segcount;

> >> >                 ref_hdr = pkt_hdr->ref_hdr;

> >> >

> >> >                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&

> >> >                     ref_count == 1)) {

> >> >                         buffer_free_multi((odp_buffer_t *)&pkt_hdr->buf_hdr.handle.handle, 1);

> >> >                 } else {

> >> >                         if (ref_count == 2)

> >> >                                 pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >> >

> >> >                         free_bufs(pkt_hdr, 0, num_seg);

> >> >                  }

> >> >

> >> >                  pkt_hdr = ref_hdr;

> >> >         } while (pkt_hdr);

> >> > }

> >> >

> >> > The mistake was trying to optimize things so that unshared_len is not

> >> > set if the buffers are being freed, but that exposes these race

> >> > conditions. So the worst that should now happen is that it is set

> >> > unnecessarily before being freed.

> >> >

> >> > If you concur I'll fold this fix into a v3 for patch

> >> > http://patches.opendataplane.org/patch/8145/

> >> >

> >> >>

> >> >> Similarly, if T2 created a new reference, T1 would have

> >> >> a wrong idea of the number of remaining references and

> >> >> would adjust the unshared_len to an incorrect value.

> >> >>

> >> >> Right?

> >> >>

> >> >> Maybe other modifications of unshared_len are also racy.

> >> >

> >> > I don't believe so, because references do not change the existing ODP

> >> > restriction that two threads cannot share the same odp_packet_t.

> When

> >> > a packet reference is created it returns a separate odp_packet_t that

> >> > has its own metadata. So unshared_len is always private to an

> >> > individual odp_packet_t. The exception is static references but in

> >> > this case the entire

> >> > packet along with its metadata must be treated as read only so

> >> > operations like odp_packet_push_head() that would try to modify

> >> > unshared_len are prohibited.

> >> >

> >> >>

> >> >>

> >> >>

> >> >> The second issue is that the atomic ops for setting and

> >> >> reading the ref count seem to have too relaxed memory

> >> >> ordering. In particular, packet_ref_dec() must not happen

> >> >> (be visible to other threads) before its caller is done

> >> >> with the packet and the related memory accesses have

> >> >> completed. Now there does not seem to be any optimization

> >> >> and memory barrier to prevent the ref count decrementing

> >> >> happening too early. So I think it is at least theoretically

> >> >> possible that a thread e.g. reads from a packet buffer

> >> >> after it has already been freed by another thread, somehow

> >> >> like this:

> >> >>

> >> >> Source code order:

> >> >> T1: interesting_data = read_from_pkt(pkt)

> >> >> T1: packet_free(pkt)

> >> >>

> >> >> Order visible to T2:

> >> >> 1: ref count decr

> >> >> 2: read from pkt

> >> >>

> >> >> Now if T2 goes and frees the remaining reference between

> >> >> steps 1 and 2, T1 may get even more interesting data.

> >> >>

> >> >> Right?

> >> >

> >> > I don't believe so. The semantics of odp_atomic_fetch_dec_u32(),

> which

> >> > is what packet_ref_dec() uses, says that no two calls can see the

> same

> >> > fetched value, so only one thread will return ref_count == 1 and free

> >> > the buffer. Note that if I see ref_count == 1 no other thread can be

> >> > trying to increment it via a concurrent odp_packet_ref() call because

> >> > that would mean that two threads were trying to manipulate the same

> >> > odp_packet_t, which is prohibited.

> >> >

> >> > For CPUs that support out of order instruction execution, this is

> only

> >> > permitted providing the reordering and speculative executions are

> >> > semantically consistent with sequential execution. If this were not

> >> > the case you'd constantly have to worry about a processor turning

> >> >

> >> > T1: interesting_data = read_from_pkt(pkt)

> >> > T1: packet_free(pkt)

> >> >

> >> > into

> >> >

> >> > T1: packet_free(pkt)

> >> > T1: interesting_data = read_from_pkt(pkt)

> >> >

> >> > In your scenario above: T2 cannot be issuing a read to pkt after

> >> > ref_count is decremented because the only way that T2 could be

> >> > decrementing ref_count would be if T2 issued an odp_packet_free()

> call

> >> > for it. Obviously if it tries to reference pkt after such a call that

> >> > is an application error.

> >> >

> >> > Thanks again for your much-appreciated help in looking at this!

> >> >

> >> >>

> >> >>         Janne

> >> >>

> >> >>
François Ozog Feb. 20, 2017, 5:02 p.m. UTC | #7
Thanks Petri,

You accurately summarized what we said in arch call today.

Bill, we'll cover the topic again on Wednesday. I have seen how it happened
behind the scenes for DPDK: 1 year architectural discussion of implications
(kind of background mode, coffee time), then little by little patch
additions bottom up (from pool changes to full API impact). I think we need
to be extra-careful with this complex topic.

Cordially,

FF



On 20 February 2017 at 16:01, Savolainen, Petri (Nokia - FI/Espoo) <
petri.savolainen@nokia-bell-labs.com> wrote:

> Hi,

>

> We are already in the phase where code in master should be maintained for

> production quality. There is no hurry to merge in code that has

> questionable quality. Zero copy packet references is not a trivial feature

> to implement. We are in much better position to review, test and use the

> reference code, if it's developed in phases. That's why I propose that we

> do multiple smaller steps:

>

> 1) merge simple, copy based implementation first (in api-next and master),

> which we can be sure that is does not break anything

> 2) write multi-threaded (performance) test apps for refs

> 3) cleanup, optimize normal packet code towards zero-copy multi-ref

> support (minimize places where refs are visible)

> 4) implement zero-copy multi-ref for most obvious use cases: maybe static

> references first ...

> 5) continue multi-ref implementation, or decide that some rare corner

> cases can be left with copy based implementation

>

> This actually follows what has been done before. Add simple, copy based

> implementation first and continue development/optimization from there. So,

> that we can step back and compare easily with previous version, if e.g.

> race conditions are found.

>

> -Petri

>

>

> > -----Original Message-----

> > From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of

> Bill

> > Fischofer

> > Sent: Saturday, February 18, 2017 6:28 PM

> > To: Francois Ozog <francois.ozog@linaro.org>

> > Cc: lng-odp@lists.linaro.org

> > Subject: Re: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

> > implement reference apis

> >

> > On Sat, Feb 18, 2017 at 9:57 AM, Francois Ozog <francois.ozog@linaro.org

> >

> > wrote:

> >

> > > Well, problem is still there.

> > > You are doing something on a packet that may not exist anymore.

> > >

> >

> > Can you elaborate? The bug fix patch eliminates the race condition that

> > Janne pointed out because no thread manipulates a packet after

> > decrementing

> > the ref_count other than to free it if that operation decremented the

> > ref_count to 0.

> >

> >

> > >

> > > On 17 February 2017 at 22:08, Bill Fischofer <

> bill.fischofer@linaro.org>

> > > wrote:

> > >

> > >> I've posted patch http://patches.opendataplane.org/patch/8155/ to

> > >> address this issue.  It goes on api-next on top of patches

> > >> http://patches.opendataplane.org/patch/7879/ and

> > >> http://patches.opendataplane.org/patch/8154/

> > >>

> > >> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

> > >> <bill.fischofer@linaro.org> wrote:

> > >> > First off, thank you very much for this review.

> > >> >

> > >> > Please note that this code has been streamlined in patch

> > >> > http://patches.opendataplane.org/patch/7879/ and has been further

> > >> > refined with patch http://patches.opendataplane.org/patch/8145/ but

> > >> > the exposure you identify still exists in that code.

> > >> >

> > >> > On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

> > >> > <janne.peltonen@nokia.com> wrote:

> > >> >> Hi,

> > >> >>

> > >> >> I took a look at the packet references and it seems to me that

> > >> >> either the implementation is a bit racy or I confused myself

> > >> >> when reading the code. Or maybe I got the intended concurrency

> > >> >> semantics of the packet references wrong?

> > >> >>

> > >> >> My first issue is that packet_free() may access freed packet

> > >> >> header or corrupt unshared_len.

> > >> >>

> > >> >> The packet free function looks like this:

> > >> >>

> > >> >> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> > >> >> {

> > >> >>         odp_packet_hdr_t *ref_hdr;

> > >> >>         uint32_t ref_count;

> > >> >>

> > >> >>         do {

> > >> >>                 ref_hdr = pkt_hdr->ref_hdr;

> > >> >>                 ref_count = packet_ref_count(pkt_hdr) - 1;

> > >> >>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> > >> >>

> > >> >>                 if (ref_count == 1)

> > >> >>                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

> > >> >>

> > >> >>                 pkt_hdr = ref_hdr;

> > >> >>         } while (pkt_hdr);

> > >> >> }

> > >> >>

> > >> >> The problem here is that decrementing the ref_count, checking

> > >> >> its value and updating unshared_len is not single atomic

> > >> >> operation. By the time packet_free() checks if ref_count == 1

> > >> >> (i.e. if there is exactly one other reference left somewhere),

> > >> >> the true ref_count may have already been changed by another

> > >> >> thread doing packet_free() or packet_ref().

> > >> >>

> > >> >> For example, if two threads have a reference to the same packet

> > >> >> then execution (or the relevant memory ops) may get "interleaved"

> > >> >> as follows:

> > >> >>

> > >> >> T1: call packet_free()

> > >> >> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

> > >> >> At this point ref_count variable is 1

> > >> >> T1: call free_bufs()

> > >> >> T1: call packet_ref_dec()

> > >> >> Now the ref_count of the packet header is 1.

> > >> >> T2: call and complete packet_free()

> > >> >> Thread 2 sees refcount 1 in the packet and frees the buffers

> > >> >> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

> > >> >> Thread 1 accesses freed buffer for reading and writing.

> > >> >

> > >> > I agree. These steps should be reversed so that the code should

> read:

> > >> >

> > >> > if (ref_count == 1)

> > >> >    pkt_hdr->unshared_len = pkt_hdr->frame_len;

> > >> >

> > >> > free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> > >> >

> > >> > Or using the code with the above two patches applied, the code

> should

> > >> read:

> > >> >

> > >> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> > >> > {

> > >> >         odp_packet_hdr_t *ref_hdr;

> > >> >         uint32_t ref_count;

> > >> >         int num_seg;

> > >> >

> > >> >         do {

> > >> >                 ref_count = packet_ref_count(pkt_hdr);

> > >> >                 num_seg = pkt_hdr->buf_hdr.segcount;

> > >> >                 ref_hdr = pkt_hdr->ref_hdr;

> > >> >

> > >> >                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&

> > >> >                     ref_count == 1)) {

> > >> >                         buffer_free_multi((odp_buffer_t *)&pkt_hdr->buf_hdr.handle.handle, 1);

> > >> >                 } else {

> > >> >                         if (ref_count == 2)

> > >> >                                 pkt_hdr->unshared_len = pkt_hdr->frame_len;

> > >> >

> > >> >                         free_bufs(pkt_hdr, 0, num_seg);

> > >> >                  }

> > >> >

> > >> >                  pkt_hdr = ref_hdr;

> > >> >         } while (pkt_hdr);

> > >> > }

> > >> >

> > >> > The mistake was trying to optimize things so that unshared_len is

> not

> > >> > set if the buffers are being freed, but that exposes these race

> > >> > conditions. So the worst that should now happen is that it is set

> > >> > unnecessarily before being freed.

> > >> >

> > >> > If you concur I'll fold this fix into a v3 for patch

> > >> > http://patches.opendataplane.org/patch/8145/

> > >> >

> > >> >>

> > >> >> Similarly, if T2 created a new reference, T1 would have

> > >> >> a wrong idea of the number of remaining references and

> > >> >> would adjust the unshared_len to an incorrect value.

> > >> >>

> > >> >> Right?

> > >> >>

> > >> >> Maybe other modifications of unshared_len are also racy.

> > >> >

> > >> > I don't believe so, because references do not change the existing

> ODP

> > >> > restriction that two threads cannot share the same odp_packet_t.

> > When

> > >> > a packet reference is created it returns a separate odp_packet_t

> that

> > >> > has its own metadata. So unshared_len is always private to an

> > >> > individual odp_packet_t. The exception is static references but in

> > >> > this case the entire

> > >> > packet along with its metadata must be treated as read only so

> > >> > operations like odp_packet_push_head() that would try to modify

> > >> > unshared_len are prohibited.

> > >> >

> > >> >>

> > >> >>

> > >> >>

> > >> >> The second issue is that the atomic ops for setting and

> > >> >> reading the ref count seem to have too relaxed memory

> > >> >> ordering. In particular, packet_ref_dec() must not happen

> > >> >> (be visible to other threads) before its caller is done

> > >> >> with the packet and the related memory accesses have

> > >> >> completed. Now there does not seem to be any optimization

> > >> >> and memory barrier to prevent the ref count decrementing

> > >> >> happening too early. So I think it is at least theoretically

> > >> >> possible that a thread e.g. reads from a packet buffer

> > >> >> after it has already been freed by another thread, somehow

> > >> >> like this:

> > >> >>

> > >> >> Source code order:

> > >> >> T1: interesting_data = read_from_pkt(pkt)

> > >> >> T1: packet_free(pkt)

> > >> >>

> > >> >> Order visible to T2:

> > >> >> 1: ref count decr

> > >> >> 2: read from pkt

> > >> >>

> > >> >> Now if T2 goes and frees the remaining reference between

> > >> >> steps 1 and 2, T1 may get even more interesting data.

> > >> >>

> > >> >> Right?

> > >> >

> > >> > I don't believe so. The semantics of odp_atomic_fetch_dec_u32(),

> > which

> > >> > is what packet_ref_dec() uses, says that no two calls can see the

> > same

> > >> > fetched value, so only one thread will return ref_count == 1 and

> free

> > >> > the buffer. Note that if I see ref_count == 1 no other thread can be

> > >> > trying to increment it via a concurrent odp_packet_ref() call

> because

> > >> > that would mean that two threads were trying to manipulate the same

> > >> > odp_packet_t, which is prohibited.

> > >> >

> > >> > For CPUs that support out of order instruction execution, this is

> > only

> > >> > permitted providing the reordering and speculative executions are

> > >> > semantically consistent with sequential execution. If this were not

> > >> > the case you'd constantly have to worry about a processor turning

> > >> >

> > >> > T1: interesting_data = read_from_pkt(pkt)

> > >> > T1: packet_free(pkt)

> > >> >

> > >> > into

> > >> >

> > >> > T1: packet_free(pkt)

> > >> > T1: interesting_data = read_from_pkt(pkt)

> > >> >

> > >> > In your scenario above: T2 cannot be issuing a read to pkt after

> > >> > ref_count is decremented because the only way that T2 could be

> > >> > decrementing ref_count would be if T2 issued an odp_packet_free()

> > call

> > >> > for it. Obviously if it tries to reference pkt after such a call

> that

> > >> > is an application error.

> > >> >

> > >> > Thanks again for your much-appreciated help in looking at this!

> > >> >

> > >> >>

> > >> >>         Janne

> > >> >>

> > >> >>

>




-- 
[image: Linaro] <http://www.linaro.org/>
François-Frédéric Ozog | *Director Linaro Networking Group*
T: +33.67221.6485
francois.ozog@linaro.org | Skype: ffozog
Bill Fischofer Feb. 20, 2017, 6:42 p.m. UTC | #8
Fair enough. If Nokia would like to write the multi-threaded stress-test
app to go with the benchmarking they are doing, I can restructure the
implementation into a series of stages that are more easily "digested".
What I don't want to have happen is this area get back-burnered with just a
general agreement that it will be worked on "sometime later". DPDK is
evolving quite rapidly these days and ODP cannot afford to be seen as
dragging our feet if we want to remain competitive.

References are something that a number of our members have been requesting
for some time and we've said they will be part of Tiger Moth, so I'd like
us to be responsive to those requests.

On Mon, Feb 20, 2017 at 11:02 AM, Francois Ozog <francois.ozog@linaro.org>
wrote:

> Thanks Petri,

>

> You accurately summarized what we said in arch call today.

>

> Bill, we'll cover the topic again on Wednesday. I have seen how it

> happened behind the scenes for DPDK: 1 year architectural discussion of

> implications (kind of background mode, coffee time), then little by little

> patch additions bottom up (from pool changes to full API impact). I think

> we need to be extra-careful with this complex topic.

>

> Cordially,

>

> FF

>

>

>

> On 20 February 2017 at 16:01, Savolainen, Petri (Nokia - FI/Espoo) <

> petri.savolainen@nokia-bell-labs.com> wrote:

>

>> Hi,

>>

>> We are already in the phase where code in master should be maintained for

>> production quality. There is no hurry to merge in code that has

>> questionable quality. Zero copy packet references is not a trivial feature

>> to implement. We are in much better position to review, test and use the

>> reference code, if it's developed in phases. That's why I propose that we

>> do multiple smaller steps:

>>

>> 1) merge simple, copy based implementation first (in api-next and

>> master), which we can be sure that is does not break anything

>> 2) write multi-threaded (performance) test apps for refs

>> 3) cleanup, optimize normal packet code towards zero-copy multi-ref

>> support (minimize places where refs are visible)

>> 4) implement zero-copy multi-ref for most obvious use cases: maybe static

>> references first ...

>> 5) continue multi-ref implementation, or decide that some rare corner

>> cases can be left with copy based implementation

>>

>> This actually follows what has been done before. Add simple, copy based

>> implementation first and continue development/optimization from there. So,

>> that we can step back and compare easily with previous version, if e.g.

>> race conditions are found.

>>

>> -Petri

>>

>>

>> > -----Original Message-----

>> > From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of

>> Bill

>> > Fischofer

>> > Sent: Saturday, February 18, 2017 6:28 PM

>> > To: Francois Ozog <francois.ozog@linaro.org>

>> > Cc: lng-odp@lists.linaro.org

>> > Subject: Re: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

>> > implement reference apis

>> >

>> > On Sat, Feb 18, 2017 at 9:57 AM, Francois Ozog <

>> francois.ozog@linaro.org>

>> > wrote:

>> >

>> > > Well, problem is still there.

>> > > You are doing something on a packet that may not exist anymore.

>> > >

>> >

>> > Can you elaborate? The bug fix patch eliminates the race condition that

>> > Janne pointed out because no thread manipulates a packet after

>> > decrementing

>> > the ref_count other than to free it if that operation decremented the

>> > ref_count to 0.

>> >

>> >

>> > >

>> > > On 17 February 2017 at 22:08, Bill Fischofer <

>> bill.fischofer@linaro.org>

>> > > wrote:

>> > >

>> > >> I've posted patch http://patches.opendataplane.org/patch/8155/ to

>> > >> address this issue.  It goes on api-next on top of patches

>> > >> http://patches.opendataplane.org/patch/7879/ and

>> > >> http://patches.opendataplane.org/patch/8154/

>> > >>

>> > >> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

>> > >> <bill.fischofer@linaro.org> wrote:

>> > >> > First off, thank you very much for this review.

>> > >> >

>> > >> > Please note that this code has been streamlined in patch

>> > >> > http://patches.opendataplane.org/patch/7879/ and has been further

>> > >> > refined with patch http://patches.opendataplane.org/patch/8145/

>> but

>> > >> > the exposure you identify still exists in that code.

>> > >> >

>> > >> > On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia -

>> FI/Espoo)

>> > >> > <janne.peltonen@nokia.com> wrote:

>> > >> >> Hi,

>> > >> >>

>> > >> >> I took a look at the packet references and it seems to me that

>> > >> >> either the implementation is a bit racy or I confused myself

>> > >> >> when reading the code. Or maybe I got the intended concurrency

>> > >> >> semantics of the packet references wrong?

>> > >> >>

>> > >> >> My first issue is that packet_free() may access freed packet

>> > >> >> header or corrupt unshared_len.

>> > >> >>

>> > >> >> The packet free function looks like this:

>> > >> >>

>> > >> >> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> > >> >> {

>> > >> >>         odp_packet_hdr_t *ref_hdr;

>> > >> >>         uint32_t ref_count;

>> > >> >>

>> > >> >>         do {

>> > >> >>                 ref_hdr = pkt_hdr->ref_hdr;

>> > >> >>                 ref_count = packet_ref_count(pkt_hdr) - 1;

>> > >> >>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>> > >> >>

>> > >> >>                 if (ref_count == 1)

>> > >> >>                         pkt_hdr->unshared_len =

>> pkt_hdr->frame_len;

>> > >> >>

>> > >> >>                 pkt_hdr = ref_hdr;

>> > >> >>         } while (pkt_hdr);

>> > >> >> }

>> > >> >>

>> > >> >> The problem here is that decrementing the ref_count, checking

>> > >> >> its value and updating unshared_len is not single atomic

>> > >> >> operation. By the time packet_free() checks if ref_count == 1

>> > >> >> (i.e. if there is exactly one other reference left somewhere),

>> > >> >> the true ref_count may have already been changed by another

>> > >> >> thread doing packet_free() or packet_ref().

>> > >> >>

>> > >> >> For example, if two threads have a reference to the same packet

>> > >> >> then execution (or the relevant memory ops) may get "interleaved"

>> > >> >> as follows:

>> > >> >>

>> > >> >> T1: call packet_free()

>> > >> >> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

>> > >> >> At this point ref_count variable is 1

>> > >> >> T1: call free_bufs()

>> > >> >> T1: call packet_ref_dec()

>> > >> >> Now the ref_count of the packet header is 1.

>> > >> >> T2: call and complete packet_free()

>> > >> >> Thread 2 sees refcount 1 in the packet and frees the buffers

>> > >> >> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> > >> >> Thread 1 accesses freed buffer for reading and writing.

>> > >> >

>> > >> > I agree. These steps should be reversed so that the code should

>> read:

>> > >> >

>> > >> > if (ref_count == 1)

>> > >> >    pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> > >> >

>> > >> > free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>> > >> >

>> > >> > Or using the code with the above two patches applied, the code

>> should

>> > >> read:

>> > >> >

>> > >> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>> > >> > {

>> > >> >         odp_packet_hdr_t *ref_hdr;

>> > >> >         uint32_t ref_count;

>> > >> >         int num_seg;

>> > >> >

>> > >> >         do {

>> > >> >                 ref_count = packet_ref_count(pkt_hdr);

>> > >> >                 num_seg = pkt_hdr->buf_hdr.segcount;

>> > >> >                 ref_hdr = pkt_hdr->ref_hdr;

>> > >> >

>> > >> >                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&

>> > >> >                     ref_count == 1)) {

>> > >> >                         buffer_free_multi((odp_buffer_t *)&pkt_hdr->buf_hdr.handle.handle, 1);

>> > >> >                 } else {

>> > >> >                         if (ref_count == 2)

>> > >> >                                 pkt_hdr->unshared_len = pkt_hdr->frame_len;

>> > >> >

>> > >> >                         free_bufs(pkt_hdr, 0, num_seg);

>> > >> >                  }

>> > >> >

>> > >> >                  pkt_hdr = ref_hdr;

>> > >> >         } while (pkt_hdr);

>> > >> > }

>> > >> >

>> > >> > The mistake was trying to optimize things so that unshared_len is

>> not

>> > >> > set if the buffers are being freed, but that exposes these race

>> > >> > conditions. So the worst that should now happen is that it is set

>> > >> > unnecessarily before being freed.

>> > >> >

>> > >> > If you concur I'll fold this fix into a v3 for patch

>> > >> > http://patches.opendataplane.org/patch/8145/

>> > >> >

>> > >> >>

>> > >> >> Similarly, if T2 created a new reference, T1 would have

>> > >> >> a wrong idea of the number of remaining references and

>> > >> >> would adjust the unshared_len to an incorrect value.

>> > >> >>

>> > >> >> Right?

>> > >> >>

>> > >> >> Maybe other modifications of unshared_len are also racy.

>> > >> >

>> > >> > I don't believe so, because references do not change the existing

>> ODP

>> > >> > restriction that two threads cannot share the same odp_packet_t.

>> > When

>> > >> > a packet reference is created it returns a separate odp_packet_t

>> that

>> > >> > has its own metadata. So unshared_len is always private to an

>> > >> > individual odp_packet_t. The exception is static references but in

>> > >> > this case the entire

>> > >> > packet along with its metadata must be treated as read only so

>> > >> > operations like odp_packet_push_head() that would try to modify

>> > >> > unshared_len are prohibited.

>> > >> >

>> > >> >>

>> > >> >>

>> > >> >>

>> > >> >> The second issue is that the atomic ops for setting and

>> > >> >> reading the ref count seem to have too relaxed memory

>> > >> >> ordering. In particular, packet_ref_dec() must not happen

>> > >> >> (be visible to other threads) before its caller is done

>> > >> >> with the packet and the related memory accesses have

>> > >> >> completed. Now there does not seem to be any optimization

>> > >> >> and memory barrier to prevent the ref count decrementing

>> > >> >> happening too early. So I think it is at least theoretically

>> > >> >> possible that a thread e.g. reads from a packet buffer

>> > >> >> after it has already been freed by another thread, somehow

>> > >> >> like this:

>> > >> >>

>> > >> >> Source code order:

>> > >> >> T1: interesting_data = read_from_pkt(pkt)

>> > >> >> T1: packet_free(pkt)

>> > >> >>

>> > >> >> Order visible to T2:

>> > >> >> 1: ref count decr

>> > >> >> 2: read from pkt

>> > >> >>

>> > >> >> Now if T2 goes and frees the remaining reference between

>> > >> >> steps 1 and 2, T1 may get even more interesting data.

>> > >> >>

>> > >> >> Right?

>> > >> >

>> > >> > I don't believe so. The semantics of odp_atomic_fetch_dec_u32(),

>> > which

>> > >> > is what packet_ref_dec() uses, says that no two calls can see the

>> > same

>> > >> > fetched value, so only one thread will return ref_count == 1 and

>> free

>> > >> > the buffer. Note that if I see ref_count == 1 no other thread can

>> be

>> > >> > trying to increment it via a concurrent odp_packet_ref() call

>> because

>> > >> > that would mean that two threads were trying to manipulate the same

>> > >> > odp_packet_t, which is prohibited.

>> > >> >

>> > >> > For CPUs that support out of order instruction execution, this is

>> > only

>> > >> > permitted providing the reordering and speculative executions are

>> > >> > semantically consistent with sequential execution. If this were not

>> > >> > the case you'd constantly have to worry about a processor turning

>> > >> >

>> > >> > T1: interesting_data = read_from_pkt(pkt)

>> > >> > T1: packet_free(pkt)

>> > >> >

>> > >> > into

>> > >> >

>> > >> > T1: packet_free(pkt)

>> > >> > T1: interesting_data = read_from_pkt(pkt)

>> > >> >

>> > >> > In your scenario above: T2 cannot be issuing a read to pkt after

>> > >> > ref_count is decremented because the only way that T2 could be

>> > >> > decrementing ref_count would be if T2 issued an odp_packet_free()

>> > call

>> > >> > for it. Obviously if it tries to reference pkt after such a call

>> that

>> > >> > is an application error.

>> > >> >

>> > >> > Thanks again for your much-appreciated help in looking at this!

>> > >> >

>> > >> >>

>> > >> >>         Janne

>> > >> >>

>> > >> >>

>>

>

>

>

> --

> [image: Linaro] <http://www.linaro.org/>

> François-Frédéric Ozog | *Director Linaro Networking Group*

> T: +33.67221.6485

> francois.ozog@linaro.org | Skype: ffozog

>

>
Maxim Uvarov Feb. 20, 2017, 7:04 p.m. UTC | #9
I think the proposed steps are:

- revert Bills patch;
- put implementation with copy to api-next, if it passes validation tests
then merge it to master;
- then apply Bills patch again and later improvement patches to api-next;
- write a stress test for master which will appear in api-next
because they are synced.
- then we say it's ready or other implementation appears then it will be
in master;

I.e. the idea is to have some stable simple version which passes validation
tests including multi-threaded examples as a base. That will be in master.

And the performance variant will be in api-next. It should be easy to
compare the api-next and master variants when somebody (Nokia?) catches
unclear failures.

Looks like in that case we do not hold up anybody's work.

Maxim.


On 02/20/17 21:42, Bill Fischofer wrote:
> Fair enough. If Nokia would like to write the multi-threaded stress-test

> app to go with the benchmarking they are doing, I can restructure the

> implementation into a series of stages that are more easily "digested".

> What I don't want to have happen is this area get back-burnered with just a

> general agreement that it will be worked on "sometime later". DPDK is

> evolving quite rapidly these days and ODP cannot afford to be seen as

> dragging our feet if we want to remain competitive.

> 

> References are something that a number of our members have been requesting

> for some time and we've said they will be part of Tiger Moth, so I'd like

> us to be responsive to those requests.

> 

> On Mon, Feb 20, 2017 at 11:02 AM, Francois Ozog <francois.ozog@linaro.org>

> wrote:

> 

>> Thanks Petri,

>>

>> You accurately summarized what we said in arch call today.

>>

>> Bill, we'll cover the topic again on Wednesday. I have seen how it

>> happened behind the scenes for DPDK: 1 year architectural discussion of

>> implications (kind of background mode, coffee time), then little by little

>> patch additions bottom up (from pool changes to full API impact). I think

>> we need to be extra-careful with this complex topic.

>>

>> Cordially,

>>

>> FF

>>

>>

>>

>> On 20 February 2017 at 16:01, Savolainen, Petri (Nokia - FI/Espoo) <

>> petri.savolainen@nokia-bell-labs.com> wrote:

>>

>>> Hi,

>>>

>>> We are already in the phase where code in master should be maintained for

>>> production quality. There is no hurry to merge in code that has

>>> questionable quality. Zero copy packet references is not a trivial feature

>>> to implement. We are in much better position to review, test and use the

>>> reference code, if it's developed in phases. That's why I propose that we

>>> do multiple smaller steps:

>>>

>>> 1) merge simple, copy based implementation first (in api-next and

>>> master), which we can be sure that is does not break anything

>>> 2) write multi-threaded (performance) test apps for refs

>>> 3) cleanup, optimize normal packet code towards zero-copy multi-ref

>>> support (minimize places where refs are visible)

>>> 4) implement zero-copy multi-ref for most obvious use cases: maybe static

>>> references first ...

>>> 5) continue multi-ref implementation, or decide that some rare corner

>>> cases can be left with copy based implementation

>>>

>>> This actually follows what has been done before. Add simple, copy based

>>> implementation first and continue development/optimization from there. So,

>>> that we can step back and compare easily with previous version, if e.g.

>>> race conditions are found.

>>>

>>> -Petri

>>>

>>>

>>>> -----Original Message-----

>>>> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of

>>> Bill

>>>> Fischofer

>>>> Sent: Saturday, February 18, 2017 6:28 PM

>>>> To: Francois Ozog <francois.ozog@linaro.org>

>>>> Cc: lng-odp@lists.linaro.org

>>>> Subject: Re: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet:

>>>> implement reference apis

>>>>

>>>> On Sat, Feb 18, 2017 at 9:57 AM, Francois Ozog <

>>> francois.ozog@linaro.org>

>>>> wrote:

>>>>

>>>>> Well, problem is still there.

>>>>> You are doing something on a packet that may not exist anymore.

>>>>>

>>>>

>>>> Can you elaborate? The bug fix patch eliminates the race condition that

>>>> Janne pointed out because no thread manipulates a packet after

>>>> decrementing

>>>> the ref_count other than to free it if that operation decremented the

>>>> ref_count to 0.

>>>>

>>>>

>>>>>

>>>>> On 17 February 2017 at 22:08, Bill Fischofer <

>>> bill.fischofer@linaro.org>

>>>>> wrote:

>>>>>

>>>>>> I've posted patch http://patches.opendataplane.org/patch/8155/ to

>>>>>> address this issue.  It goes on api-next on top of patches

>>>>>> http://patches.opendataplane.org/patch/7879/ and

>>>>>> http://patches.opendataplane.org/patch/8154/

>>>>>>

>>>>>> On Fri, Feb 17, 2017 at 2:39 PM, Bill Fischofer

>>>>>> <bill.fischofer@linaro.org> wrote:

>>>>>>> First off, thank you very much for this review.

>>>>>>>

>>>>>>> Please note that this code has been streamlined in patch

>>>>>>> http://patches.opendataplane.org/patch/7879/ and has been further

>>>>>>> refined with patch http://patches.opendataplane.org/patch/8145/

>>> but

>>>>>>> the exposure you identify still exists in that code.

>>>>>>>

>>>>>>> On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia -

>>> FI/Espoo)

>>>>>>> <janne.peltonen@nokia.com> wrote:

>>>>>>>> Hi,

>>>>>>>>

>>>>>>>> I took a look at the packet references and it seems to me that

>>>>>>>> either the implementation is a bit racy or I confused myself

>>>>>>>> when reading the code. Or maybe I got the intended concurrency

>>>>>>>> semantics of the packet references wrong?

>>>>>>>>

>>>>>>>> My first issue is that packet_free() may access freed packet

>>>>>>>> header or corrupt unshared_len.

>>>>>>>>

>>>>>>>> The packet free function looks like this:

>>>>>>>>

>>>>>>>> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>>>>>>>> {

>>>>>>>>         odp_packet_hdr_t *ref_hdr;

>>>>>>>>         uint32_t ref_count;

>>>>>>>>

>>>>>>>>         do {

>>>>>>>>                 ref_hdr = pkt_hdr->ref_hdr;

>>>>>>>>                 ref_count = packet_ref_count(pkt_hdr) - 1;

>>>>>>>>                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>>>>>>>>

>>>>>>>>                 if (ref_count == 1)

>>>>>>>>                         pkt_hdr->unshared_len =

>>> pkt_hdr->frame_len;

>>>>>>>>

>>>>>>>>                 pkt_hdr = ref_hdr;

>>>>>>>>         } while (pkt_hdr);

>>>>>>>> }

>>>>>>>>

>>>>>>>> The problem here is that decrementing the ref_count, checking

>>>>>>>> its value and updating unshared_len is not single atomic

>>>>>>>> operation. By the time packet_free() checks if ref_count == 1

>>>>>>>> (i.e. if there is exactly one other reference left somewhere),

>>>>>>>> the true ref_count may have already been changed by another

>>>>>>>> thread doing packet_free() or packet_ref().

>>>>>>>>

>>>>>>>> For example, if two threads have a reference to the same packet

>>>>>>>> then execution (or the relevant memory ops) may get "interleaved"

>>>>>>>> as follows:

>>>>>>>>

>>>>>>>> T1: call packet_free()

>>>>>>>> T1: ref_count = packet_ref_count(pkt_hdr) - 1;

>>>>>>>> At this point ref_count variable is 1

>>>>>>>> T1: call free_bufs()

>>>>>>>> T1: call packet_ref_dec()

>>>>>>>> Now the ref_count of the packet header is 1.

>>>>>>>> T2: call and complete packet_free()

>>>>>>>> Thread 2 sees refcount 1 in the packet and frees the buffers

>>>>>>>> T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

>>>>>>>> Thread 1 accesses freed buffer for reading and writing.

>>>>>>>

>>>>>>> I agree. These steps should be reversed so that the code should

>>> read:

>>>>>>>

>>>>>>> if (ref_count == 1)

>>>>>>>    pkt_hdr->unshared_len = pkt_hdr->frame_len;

>>>>>>>

>>>>>>> free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

>>>>>>>

>>>>>>> Or using the code with the above two patches applied, the code

>>> should

>>>>>> read:

>>>>>>>

>>>>>>> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

>>>>>>> {

>>>>>>>         odp_packet_hdr_t *ref_hdr;

>>>>>>>         uint32_t ref_count;

>>>>>>>         int num_seg;

>>>>>>>

>>>>>>>         do {

>>>>>>>                 ref_count = packet_ref_count(pkt_hdr);

>>>>>>>                 num_seg = pkt_hdr->buf_hdr.segcount;

>>>>>>>                 ref_hdr = pkt_hdr->ref_hdr;

>>>>>>>

>>>>>>>                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 ||

>>>> num_seg

>>>>>> == 1) &&

>>>>>>>                     ref_count == 1)) {

>>>>>>>                         buffer_free_multi((odp_buffer_t

>>>>>>> *)&pkt_hdr->buf_hdr.handle.handle, 1);

>>>>>>>                 } else {

>>>>>>>                         if (ref_count == 2)

>>>>>>>                                 pkt_hdr->unshared_len =

>>>>>> pkt_hdr->frame_len;

>>>>>>>

>>>>>>>                         free_bufs(pkt_hdr, 0, num_seg);

>>>>>>>                  }

>>>>>>>

>>>>>>>                  pkt_hdr = ref_hdr;

>>>>>>>         } while (pkt_hdr);

>>>>>>> }

>>>>>>>

>>>>>>> The mistake was trying to optimize things so that unshared_len is

>>> not

>>>>>>> set if the buffers are being freed, but that exposes these race

>>>>>>> conditions. So the worst that should now happen is that it is set

>>>>>>> unnecessarily before being freed.

>>>>>>>

>>>>>>> If you concur I'll fold this fix into a v3 for patch

>>>>>>> http://patches.opendataplane.org/patch/8145/

>>>>>>>

>>>>>>>>

>>>>>>>> Similarly, if T2 created a new reference, T1 would have

>>>>>>>> a wrong idea of the number of remaining references and

>>>>>>>> would adjust the unshared_len to an incorrect value.

>>>>>>>>

>>>>>>>> Right?

>>>>>>>>

>>>>>>>> Maybe other modifications of unshared_len are also racy.

>>>>>>>

>>>>>>> I don't believe so, because references do not change the existing

>>> ODP

>>>>>>> restriction that two threads cannot share the same odp_packet_t.

>>>> When

>>>>>>> a packet reference is created it returns a separate odp_packet_t

>>> that

>>>>>>> has its own metadata. So unshared_len is always private to an

>>>>>>> individual odp_packet_t. The exception is static references but in

>>>>>>> this case the entire

>>>>>>> packet along with its metadata must be treated as read only so

>>>>>>> operations like odp_packet_push_head() that would try to modify

>>>>>>> unshared_len are prohibited.

>>>>>>>

>>>>>>>>

>>>>>>>>

>>>>>>>>

>>>>>>>> The second issue is that the atomic ops for setting and

>>>>>>>> reading the ref count seem to have too relaxed memory

>>>>>>>> ordering. In particular, packet_ref_dec() must not happen

>>>>>>>> (be visible to other threads) before its caller is done

>>>>>>>> with the packet and the related memory accesses have

>>>>>>>> completed. Now there does not seem to be any optimization

>>>>>>>> and memory barrier to prevent the ref count decrementing

>>>>>>>> happening too early. So I think it is at least theoretically

>>>>>>>> possible that a thread e.g. reads from a packet buffer

>>>>>>>> after it has already been freed by another thread, somehow

>>>>>>>> like this:

>>>>>>>>

>>>>>>>> Source code order:

>>>>>>>> T1: interesting_data = read_from_pkt(pkt)

>>>>>>>> T1: packet_free(pkt)

>>>>>>>>

>>>>>>>> Order visible to T2:

>>>>>>>> 1: ref count decr

>>>>>>>> 2: read from pkt

>>>>>>>>

>>>>>>>> Now if T2 goes and frees the remaining reference between

>>>>>>>> steps 1 and 2, T1 may get even more interesting data.

>>>>>>>>

>>>>>>>> Right?

>>>>>>>

>>>>>>> I don't believe so. The semantics of odp_atomic_fetch_dec_u32(),

>>>> which

>>>>>>> is what packet_ref_dec() uses, says that no two calls can see the

>>>> same

>>>>>>> fetched value, so only one thread will return ref_count == 1 and

>>> free

>>>>>>> the buffer. Note that if I see ref_count == 1 no other thread can

>>> be

>>>>>>> trying to increment it via a concurrent odp_packet_ref() call

>>> because

>>>>>>> that would mean that two threads were trying to manipulate the same

>>>>>>> odp_packet_t, which is prohibited.

>>>>>>>

>>>>>>> For CPUs that support out of order instruction execution, this is

>>>> only

>>>>>>> permitted providing the reordering and speculative executions are

>>>>>>> semantically consistent with sequential execution. If this were not

>>>>>>> the case you'd constantly have to worry about a processor turning

>>>>>>>

>>>>>>> T1: interesting_data = read_from_pkt(pkt)

>>>>>>> T1: packet_free(pkt)

>>>>>>>

>>>>>>> into

>>>>>>>

>>>>>>> T1: packet_free(pkt)

>>>>>>> T1: interesting_data = read_from_pkt(pkt)

>>>>>>>

>>>>>>> In your scenario above: T2 cannot be issuing a read to pkt after

>>>>>>> ref_count is decremented because the only way that T2 could be

>>>>>>> decrementing ref_count would be if T2 issued an odp_packet_free()

>>>> call

>>>>>>> for it. Obviously if it tries to reference pkt after such a call

>>> that

>>>>>>> is an application error.

>>>>>>>

>>>>>>> Thanks again for your much-appreciated help in looking at this!

>>>>>>>

>>>>>>>>

>>>>>>>>         Janne

>>>>>>>>

>>>>>>>>

>>>

>>

>>

>>

>> --

>> [image: Linaro] <http://www.linaro.org/>

>> François-Frédéric Ozog | *Director Linaro Networking Group*

>> T: +33.67221.6485

>> francois.ozog@linaro.org | Skype: ffozog

>>

>>
Peltonen, Janne (Nokia - FI/Espoo) Feb. 20, 2017, 8:14 p.m. UTC | #10
Hi,

Comments below.

	Janne

> -----Original Message-----

> From: Bill Fischofer [mailto:bill.fischofer@linaro.org]

> Sent: Friday, February 17, 2017 10:39 PM

> To: Peltonen, Janne (Nokia - FI/Espoo) <janne.peltonen@nokia.com>

> Cc: lng-odp@lists.linaro.org

> Subject: Re: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet: implement reference

> apis

> 

> First off, thank you very much for this review.

> 

> Please note that this code has been streamlined in patch

> http://patches.opendataplane.org/patch/7879/ and has been further

> refined with patch http://patches.opendataplane.org/patch/8145/ but

> the exposure you identify still exists in that code.

> 

> On Fri, Feb 17, 2017 at 11:31 AM, Peltonen, Janne (Nokia - FI/Espoo)

> <janne.peltonen@nokia.com> wrote:

> > Hi,

> >

> > I took a look at the packet references and it seems to me that

> > either the implementation is a bit racy or I confused myself

> > when reading the code. Or maybe I got the intended concurrency

> > semantics of the packet references wrong?

> >

> > My first issue is that packet_free() may access freed packet

> > header or corrupt unshared_len.

> >

> > The packet free function looks like this:

> >

> > static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> > {

> >         odp_packet_hdr_t *ref_hdr;

> >         uint32_t ref_count;

> >

> >         do {

> >                 ref_hdr = pkt_hdr->ref_hdr;

> >                 ref_count = packet_ref_count(pkt_hdr) - 1;

> >                 free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >

> >                 if (ref_count == 1)

> >                         pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >

> >                 pkt_hdr = ref_hdr;

> >         } while (pkt_hdr);

> > }

> >

> > The problem here is that decrementing the ref_count, checking

> > its value and updating unshared_len is not single atomic

> > operation. By the time packet_free() checks if ref_count == 1

> > (i.e. if there is exactly one other reference left somewhere),

> > the true ref_count may have already been changed by another

> > thread doing packet_free() or packet_ref().

> >

> > For example, if two threads have a reference to the same packet

> > then execution (or the relevant memory ops) may get "interleaved"

> > as follows:

> >

> > T1: call packet_free()

> > T1: ref_count = packet_ref_count(pkt_hdr) - 1;

> > At this point ref_count variable is 1

> > T1: call free_bufs()

> > T1: call packet_ref_dec()

> > Now the ref_count of the packet header is 1.

> > T2: call and complete packet_free()

> > Thread 2 sees refcount 1 in the packet and frees the buffers

> > T1: pkt_hdr->unshared_len = pkt_hdr->frame_len;

> > Thread 1 accesses freed buffer for reading and writing.

> 

> I agree. These steps should be reversed so that the code should read:

> 

> if (ref_count == 1)

>    pkt_hdr->unshared_len = pkt_hdr->frame_len;

> 

> free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> 

> Or using the code with the above two patches applied, the code should read:

> 

> static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> {

>         odp_packet_hdr_t *ref_hdr;

>         uint32_t ref_count;

>         int num_seg;

> 

>         do {

>                 ref_count = packet_ref_count(pkt_hdr);

>                 num_seg = pkt_hdr->buf_hdr.segcount;

>                 ref_hdr = pkt_hdr->ref_hdr;

> 

>                 if (odp_likely((CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1) &&

>                     ref_count == 1)) {

>                         buffer_free_multi((odp_buffer_t

> *)&pkt_hdr->buf_hdr.handle.handle, 1);

>                 } else {

>                         if (ref_count == 2)

>                                 pkt_hdr->unshared_len = pkt_hdr->frame_len;

> 

>                         free_bufs(pkt_hdr, 0, num_seg);

>                  }

> 

>                  pkt_hdr = ref_hdr;

>         } while (pkt_hdr);

> }

> 

> The mistake was trying to optimize things so that unshared_len is not

> set if the buffers are being freed, but that exposes these race

> conditions. So the worst that should now happen is that it is set

> unnecessarily before being freed.

> 

> If you concur I'll fold this fix into a v3 for patch

> http://patches.opendataplane.org/patch/8145/


I commented separately on the patch you sent.

> 

> >

> > Similarly, if T2 created a new reference, T1 would have

> > a wrong idea of the number of remaining references and

> > would adjust the unshared_len to an incorrect value.

> >

> > Right?

> >

> > Maybe other modifications of unshared_len are also racy.

> 

> I don't believe so, because references do not change the existing ODP

> restriction that two threads cannot share the same odp_packet_t.  When

> a packet reference is created it returns a separate odp_packet_t that

> has its own metadata. So unshared_len is always private to an

> individual odp_packet_t.


Yes, the same odp_packet_t must not be used concurrently by many threads
(ignoring static references for now), but in the implementation the
odp_packet_hdr_t of the base packet is accessed and used by all references
that may be held in separate threads.

odp_packet_unshared_len() uses the unshared_len field of the first shared
packet header, not the packet header that was created for the reference.
Packet allocation and freeing both adjust the unshared_len of a shared
packet header.

So, the base packet headers really do seem to be shared even across
threads even if the odp_packet_t handles as seen by the application were
not.

Suppose I have the base packet buffer B in thread 1 and a reference R1
to it in thread 2. If thread 1 does push_head(B, ...) then B->frame_len
and B->unshared_len are both read and written by thread 1. If simultaneously
thread 2 does R2 = odp_packet_ref(R1, ...), then B->frame_len is read
and B->unshared_len is read and written by thread 2. This looks like
a data race to me.

> The exception is static references but in

> this case the entire

> packet along with its metadata must be treated as read only so

> operations like odp_packet_push_head() that would try to modify

> unshared_len are prohibited.

> 

> >

> >

> >

> > The second issue is that the atomic ops for setting and

> > reading the ref count seem to have too relaxed memory

> > ordering. In particular, packet_ref_dec() must not happen

> > (be visible to other threads) before its caller is done

> > with the packet and the related memory accesses have

> > completed. Now there does not seem to be any optimization

> > and memory barrier to prevent the ref count decrementing

> > happening too early. So I think it is at least theoretically

> > possible that a thread e.g. reads from a packet buffer

> > after it has already been freed by another thread, somehow

> > like this:

> >

> > Source code order:

> > T1: interesting_data = read_from_pkt(pkt)

> > T1: packet_free(pkt)

> >

> > Order visible to T2:

> > 1: ref count decr

> > 2: read from pkt

> >

> > Now if T2 goes and frees the remaining reference between

> > steps 1 and 2, T1 may get even more interesting data.

> >

> > Right?

> 

> I don't believe so. The semantics of odp_atomic_fetch_dec_u32(), which

> is what packet_ref_dec() uses, says that no two calls can see the same

> fetched value, so only one thread will return ref_count == 1 and free

> the buffer. Note that if I see ref_count == 1 no other thread can be

> trying to increment it via a concurrent odp_packet_ref() call


Yes, but the point was about how the other memory accesses are
visible to the other thread relative to the atomic refcount adjustment.
Since odp_atomic_fetch_dec_u32() has relaxed memory ordering as far
as I can see, nothing is guaranteed on _when_ the atomic decrement
is going to be visible to other threads and whether other, non-atomic
reads and writes become visible to other threads before or after
the atomic decrement.

> because

> that would mean that two threads were trying to manipulate the same

> odp_packet_t, which is prohibited.

> 

> For CPUs that support out of order instruction execution, this is only

> permitted providing the reordering and speculative executions are

> semantically consistent with sequential execution. If this were not

> the case you'd constantly have to worry about a processor turning


This is so when you consider just one core, but with multiple cores
the other cores may not see reads and writes occur in the same order
as they appear in the machine code. But even without the possibility
of non-sequential memory ordering in the CPU, the compiler might
reorder the operations if it thinks (in the absence of synchronization
operations) that for single thread the results would be the same.

> 

> T1: interesting_data = read_from_pkt(pkt)

> T1: packet_free(pkt)

> 

> into

> 

> T1: packet_free(pkt)

> T1: interesting_data = read_from_pkt(pkt)

> 

> In your scenario above: T2 cannot be issuing a read to pkt after

> ref_count is decremented because the only way that T2 could be

> decrementing ref_count would be if T2 issued an odp_packet_free() call

> for it. Obviously if it tries to reference pkt after such a call that

> is an application error.


The memory load for read_from_pkt and the atomic fetch and dec for
packet_free() can appear in any order in the machine code output by the
compiler and/or in the memory bus operations done by the CPU core since
they access different memory locations and there are no optimization
barriers or any synchronization operations restricting the reordering.

> 

> Thanks again for your much-appreciated help in looking at this!

> 

> >

> >         Janne

> >

> >

> >> -----Original Message-----

> >> From: lng-odp [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of Bill Fischofer

> >> Sent: Wednesday, January 11, 2017 4:34 AM

> >> To: lng-odp@lists.linaro.org

> >> Subject: [lng-odp] [API-NEXT PATCHv7 2/5] linux-generic: packet: implement reference

> apis

> >>

> >> Implement the APIs:

> >> - odp_packet_ref_static()

> >> - odp_packet_ref()

> >> - odp_packet_ref_pkt()

> >> - odp_packet_has_ref()

> >> - odp_packet_unshared_len()

> >>

> >> This also involves functional upgrades to the existing packet manipulation

> >> APIs to work with packet references as input arguments.

> >>

> >> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org>

> >> ---

> >>  .../linux-generic/include/odp_packet_internal.h    |  87 +++-

> >>  platform/linux-generic/odp_packet.c                | 536 +++++++++++++++++----

> >>  2 files changed, 516 insertions(+), 107 deletions(-)

> >>

> >> diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-

> >> generic/include/odp_packet_internal.h

> >> index e6e9d74..607560d 100644

> >> --- a/platform/linux-generic/include/odp_packet_internal.h

> >> +++ b/platform/linux-generic/include/odp_packet_internal.h

> >> @@ -19,6 +19,7 @@ extern "C" {

> >>

> >>  #include <odp/api/align.h>

> >>  #include <odp/api/debug.h>

> >> +#include <odp_debug_internal.h>

> >>  #include <odp_buffer_internal.h>

> >>  #include <odp_pool_internal.h>

> >>  #include <odp_buffer_inlines.h>

> >> @@ -168,7 +169,7 @@ typedef struct {

> >>   * packet_init(). Because of this any new fields added must be reviewed for

> >>   * initialization requirements.

> >>   */

> >> -typedef struct {

> >> +typedef struct odp_packet_hdr_t {

> >>       /* common buffer header */

> >>       odp_buffer_hdr_t buf_hdr;

> >>

> >> @@ -184,6 +185,13 @@ typedef struct {

> >>       uint32_t headroom;

> >>       uint32_t tailroom;

> >>

> >> +     /* Fields used to support packet references */

> >> +     uint32_t unshared_len;

> >> +     struct odp_packet_hdr_t *ref_hdr;

> >> +     uint32_t ref_offset;

> >> +     uint32_t ref_len;

> >> +     odp_atomic_u32_t ref_count;

> >> +

> >>       /*

> >>        * Members below are not initialized by packet_init()

> >>        */

> >> @@ -212,6 +220,55 @@ static inline odp_packet_hdr_t *odp_packet_hdr(odp_packet_t pkt)

> >>       return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);

> >>  }

> >>

> >> +static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,

> >> +                                                 uint32_t *offset)

> >> +{

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

> >> +     uint32_t ref_offset = 0;

> >> +

> >> +     while (pkt_hdr->ref_hdr) {

> >> +             ref_offset = pkt_hdr->ref_offset;

> >> +             prev_hdr   = pkt_hdr;

> >> +             pkt_hdr    = pkt_hdr->ref_hdr;

> >> +     }

> >> +

> >> +     if (offset) {

> >> +             if (prev_hdr != pkt_hdr)

> >> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

> >> +             *offset = ref_offset;

> >> +     }

> >> +

> >> +     return pkt_hdr;

> >> +}

> >> +

> >> +static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t *pkt_hdr,

> >> +                                                 odp_packet_hdr_t *cur_hdr,

> >> +                                                 uint32_t *offset)

> >> +{

> >> +     uint32_t ref_offset = 0;

> >> +     odp_packet_hdr_t *prev_hdr = pkt_hdr;

> >> +

> >> +     while (pkt_hdr->ref_hdr != cur_hdr) {

> >> +             ref_offset = pkt_hdr->ref_offset;

> >> +             prev_hdr   = pkt_hdr;

> >> +             pkt_hdr    = pkt_hdr->ref_hdr;

> >> +     }

> >> +

> >> +     if (offset) {

> >> +             if (prev_hdr != pkt_hdr)

> >> +                     ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;

> >> +             *offset = ref_offset;

> >> +     }

> >> +

> >> +     return pkt_hdr;

> >> +}

> >> +

> >> +static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);

> >> +}

> >> +

> >>  static inline void copy_packet_parser_metadata(odp_packet_hdr_t *src_hdr,

> >>                                              odp_packet_hdr_t *dst_hdr)

> >>  {

> >> @@ -234,17 +291,43 @@ static inline void pull_tail(odp_packet_hdr_t *pkt_hdr, uint32_t

> >> len)

> >>

> >>       pkt_hdr->tailroom  += len;

> >>       pkt_hdr->frame_len -= len;

> >> +     pkt_hdr->unshared_len -= len;

> >>       pkt_hdr->buf_hdr.seg[last].len -= len;

> >>  }

> >>

> >>  static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)

> >>  {

> >> -     return pkt_hdr->frame_len;

> >> +     uint32_t pkt_len = 0;

> >> +     uint32_t offset  = 0;

> >> +

> >> +     do {

> >> +             pkt_len += pkt_hdr->frame_len - offset;

> >> +             offset   = pkt_hdr->ref_offset;

> >> +             if (pkt_hdr->ref_hdr)

> >> +                     offset += (pkt_hdr->ref_hdr->frame_len -

> >> +                                pkt_hdr->ref_len);

> >> +             pkt_hdr  = pkt_hdr->ref_hdr;

> >> +     } while (pkt_hdr);

> >> +

> >> +     return pkt_len;

> >> +}

> >> +

> >> +static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     return odp_atomic_load_u32(&pkt_hdr->ref_count);

> >> +}

> >> +

> >> +static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr, uint32_t n)

> >> +{

> >> +     odp_atomic_init_u32(&pkt_hdr->ref_count, n);

> >>  }

> >>

> >>  static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr, uint32_t len)

> >>  {

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >> +

> >>       pkt_hdr->frame_len = len;

> >> +     pkt_hdr->unshared_len = len;

> >>  }

> >>

> >>  static inline int packet_parse_l2_not_done(packet_parser_t *prs)

> >> diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c

> >> index f632a51..170965a 100644

> >> --- a/platform/linux-generic/odp_packet.c

> >> +++ b/platform/linux-generic/odp_packet.c

> >> @@ -33,13 +33,24 @@ static inline odp_buffer_t buffer_handle(odp_packet_hdr_t *pkt_hdr)

> >>       return pkt_hdr->buf_hdr.handle.handle;

> >>  }

> >>

> >> +static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);

> >> +}

> >> +

> >> +static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);

> >> +}

> >> +

> >>  static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,

> >>                                     uint32_t seg_idx)

> >>  {

> >>       return pkt_hdr->buf_hdr.seg[seg_idx].len;

> >>  }

> >>

> >> -static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr, uint32_t seg_idx)

> >> +static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,

> >> +                                    uint32_t seg_idx)

> >>  {

> >>       return pkt_hdr->buf_hdr.seg[seg_idx].data;

> >>  }

> >> @@ -52,6 +63,11 @@ static inline int packet_last_seg(odp_packet_hdr_t *pkt_hdr)

> >>               return pkt_hdr->buf_hdr.segcount - 1;

> >>  }

> >>

> >> +static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     return pkt_hdr->buf_hdr.seg[0].data;

> >> +}

> >> +

> >>  static inline uint32_t packet_first_seg_len(odp_packet_hdr_t *pkt_hdr)

> >>  {

> >>       return packet_seg_len(pkt_hdr, 0);

> >> @@ -64,11 +80,6 @@ static inline uint32_t packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)

> >>       return packet_seg_len(pkt_hdr, last);

> >>  }

> >>

> >> -static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)

> >> -{

> >> -     return pkt_hdr->buf_hdr.seg[0].data;

> >> -}

> >> -

> >>  static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)

> >>  {

> >>       int last = packet_last_seg(pkt_hdr);

> >> @@ -99,6 +110,7 @@ static inline void push_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

> >>  {

> >>       pkt_hdr->headroom  -= len;

> >>       pkt_hdr->frame_len += len;

> >> +     pkt_hdr->unshared_len += len;

> >>       pkt_hdr->buf_hdr.seg[0].data -= len;

> >>       pkt_hdr->buf_hdr.seg[0].len  += len;

> >>  }

> >> @@ -107,6 +119,7 @@ static inline void pull_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)

> >>  {

> >>       pkt_hdr->headroom  += len;

> >>       pkt_hdr->frame_len -= len;

> >> +     pkt_hdr->unshared_len -= len;

> >>       pkt_hdr->buf_hdr.seg[0].data += len;

> >>       pkt_hdr->buf_hdr.seg[0].len  -= len;

> >>  }

> >> @@ -117,6 +130,7 @@ static inline void push_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)

> >>

> >>       pkt_hdr->tailroom  -= len;

> >>       pkt_hdr->frame_len += len;

> >> +     pkt_hdr->unshared_len += len;

> >>       pkt_hdr->buf_hdr.seg[last].len += len;

> >>  }

> >>

> >> @@ -144,6 +158,10 @@ static inline void packet_seg_copy_md(odp_packet_hdr_t *dst,

> >>       dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;

> >>       dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;

> >>

> >> +     /* reference related metadata */

> >> +     dst->ref_len      = src->ref_len;

> >> +     dst->unshared_len = src->unshared_len;

> >> +

> >>       /* segmentation data is not copied:

> >>        *   buf_hdr.seg[]

> >>        *   buf_hdr.segcount

> >> @@ -158,7 +176,15 @@ static inline void *packet_map(odp_packet_hdr_t *pkt_hdr,

> >>       int seg = 0;

> >>       int seg_count = pkt_hdr->buf_hdr.segcount;

> >>

> >> -     if (odp_unlikely(offset >= pkt_hdr->frame_len))

> >> +     /* Special processing for references */

> >> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> >> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

> >> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >> +             seg_count = pkt_hdr->buf_hdr.segcount;

> >> +     }

> >> +

> >> +     if (odp_unlikely(offset > pkt_hdr->frame_len))

> >>               return NULL;

> >>

> >>       if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {

> >> @@ -207,6 +233,9 @@ void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)

> >>       pkt_hdr->p.l2_offset        = 0;

> >>       pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;

> >>       pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;

> >> +

> >> +     /* Ensure dummy pkt_hdrs used in I/O recv classification are valid */

> >> +     pkt_hdr->ref_hdr = NULL;

> >>  }

> >>

> >>  /**

> >> @@ -252,6 +281,10 @@ static inline void packet_init(odp_packet_hdr_t *pkt_hdr, uint32_t len,

> >>                            CONFIG_PACKET_TAILROOM;

> >>

> >>       pkt_hdr->input = ODP_PKTIO_INVALID;

> >> +

> >> +     /* By default packet has no references */

> >> +     pkt_hdr->unshared_len = len;

> >> +     pkt_hdr->ref_hdr = NULL;

> >>  }

> >>

> >>  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> >> @@ -264,6 +297,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> >>

> >>       hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;

> >>       hdr->buf_hdr.seg[0].len  = BASE_LEN;

> >> +     packet_ref_count_set(hdr, 1);

> >>

> >>       /* Link segments */

> >>       if (CONFIG_PACKET_MAX_SEGS != 1) {

> >> @@ -273,6 +307,7 @@ static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)

> >>                       for (i = 1; i < num; i++) {

> >>                               odp_buffer_hdr_t *buf_hdr;

> >>

> >> +                             packet_ref_count_set(pkt_hdr[i], 1);

> >>                               buf_hdr = &pkt_hdr[i]->buf_hdr;

> >>                               hdr->buf_hdr.seg[i].hdr  = buf_hdr;

> >>                               hdr->buf_hdr.seg[i].data = buf_hdr->base_data;

> >> @@ -376,9 +411,10 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,

> >>               new_hdr->buf_hdr.seg[0].len   = seg_len;

> >>

> >>               packet_seg_copy_md(new_hdr, pkt_hdr);

> >> -             new_hdr->frame_len = pkt_hdr->frame_len + len;

> >> -             new_hdr->headroom  = pool->headroom + offset;

> >> -             new_hdr->tailroom  = pkt_hdr->tailroom;

> >> +             new_hdr->frame_len    = pkt_hdr->frame_len + len;

> >> +             new_hdr->unshared_len = pkt_hdr->unshared_len + len;

> >> +             new_hdr->headroom     = pool->headroom + offset;

> >> +             new_hdr->tailroom     = pkt_hdr->tailroom;

> >>

> >>               pkt_hdr = new_hdr;

> >>       } else {

> >> @@ -391,8 +427,9 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,

> >>               last = packet_last_seg(pkt_hdr);

> >>               pkt_hdr->buf_hdr.seg[last].len = seg_len;

> >>

> >> -             pkt_hdr->frame_len += len;

> >> -             pkt_hdr->tailroom   = pool->tailroom + offset;

> >> +             pkt_hdr->frame_len    += len;

> >> +             pkt_hdr->unshared_len += len;

> >> +             pkt_hdr->tailroom      = pool->tailroom + offset;

> >>       }

> >>

> >>       return pkt_hdr;

> >> @@ -400,13 +437,18 @@ static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,

> >>

> >>  static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first, int num)

> >>  {

> >> -     int i;

> >> +     int i, nfree;

> >>       odp_buffer_t buf[num];

> >>

> >> -     for (i = 0; i < num; i++)

> >> -             buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first + i].hdr);

> >> +     for (i = 0, nfree = 0; i < num; i++) {

> >> +             odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first + i].hdr;

> >> +

> >> +             if (packet_ref_dec(hdr) == 1)

> >> +                     buf[nfree++] = buffer_handle(hdr);

> >> +     }

> >>

> >> -     buffer_free_multi(buf, num);

> >> +     if (nfree > 0)

> >> +             buffer_free_multi(buf, nfree);

> >>  }

> >>

> >>  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

> >> @@ -417,11 +459,15 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

> >>

> >>       if (head) {

> >>               odp_packet_hdr_t *new_hdr;

> >> -             int i;

> >> +             int i, nfree;

> >>               odp_buffer_t buf[num];

> >>

> >> -             for (i = 0; i < num; i++)

> >> -                     buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);

> >> +             for (i = 0, nfree = 0; i < num; i++) {

> >> +                     new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> >> +

> >> +                     if (packet_ref_dec(new_hdr) == 1)

> >> +                             buf[nfree++] = buffer_handle(new_hdr);

> >> +             }

> >>

> >>               /* First remaining segment is the new packet descriptor */

> >>               new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;

> >> @@ -430,15 +476,17 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

> >>               packet_seg_copy_md(new_hdr, pkt_hdr);

> >>

> >>               /* Tailroom not changed */

> >> -             new_hdr->tailroom  = pkt_hdr->tailroom;

> >> -             new_hdr->headroom  = seg_headroom(new_hdr, 0);

> >> -             new_hdr->frame_len = pkt_hdr->frame_len - free_len;

> >> +             new_hdr->tailroom     = pkt_hdr->tailroom;

> >> +             new_hdr->headroom     = seg_headroom(new_hdr, 0);

> >> +             new_hdr->frame_len    = pkt_hdr->frame_len - free_len;

> >> +             new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;

> >>

> >>               pull_head(new_hdr, pull_len);

> >>

> >>               pkt_hdr = new_hdr;

> >>

> >> -             buffer_free_multi(buf, num);

> >> +             if (nfree > 0)

> >> +                     buffer_free_multi(buf, nfree);

> >>       } else {

> >>               /* Free last 'num' bufs */

> >>               free_bufs(pkt_hdr, num_remain, num);

> >> @@ -447,6 +495,7 @@ static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,

> >>                * of the metadata. */

> >>               pkt_hdr->buf_hdr.segcount = num_remain;

> >>               pkt_hdr->frame_len -= free_len;

> >> +             pkt_hdr->unshared_len -= free_len;

> >>               pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain - 1);

> >>

> >>               pull_tail(pkt_hdr, pull_len);

> >> @@ -550,45 +599,34 @@ int odp_packet_alloc_multi(odp_pool_t pool_hdl, uint32_t len,

> >>       return num;

> >>  }

> >>

> >> -void odp_packet_free(odp_packet_t pkt)

> >> +static inline void packet_free(odp_packet_hdr_t *pkt_hdr)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> -     int num_seg = pkt_hdr->buf_hdr.segcount;

> >> +     odp_packet_hdr_t *ref_hdr;

> >> +     uint32_t ref_count;

> >>

> >> -     if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))

> >> -             buffer_free_multi((odp_buffer_t *)&pkt, 1);

> >> -     else

> >> -             free_bufs(pkt_hdr, 0, num_seg);

> >> -}

> >> +     do {

> >> +             ref_hdr = pkt_hdr->ref_hdr;

> >> +             ref_count = packet_ref_count(pkt_hdr) - 1;

> >> +             free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);

> >>

> >> -void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> >> -{

> >> -     if (CONFIG_PACKET_MAX_SEGS == 1) {

> >> -             buffer_free_multi((const odp_buffer_t * const)pkt, num);

> >> -     } else {

> >> -             odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];

> >> -             int i, j;

> >> -             int bufs = 0;

> >> +             if (ref_count == 1)

> >> +                     pkt_hdr->unshared_len = pkt_hdr->frame_len;

> >>

> >> -             for (i = 0; i < num; i++) {

> >> -                     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt[i]);

> >> -                     int num_seg = pkt_hdr->buf_hdr.segcount;

> >> -                     odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;

> >> -

> >> -                     buf[bufs] = (odp_buffer_t)pkt[i];

> >> -                     bufs++;

> >> +             pkt_hdr = ref_hdr;

> >> +     } while (pkt_hdr);

> >> +}

> >>

> >> -                     if (odp_likely(num_seg == 1))

> >> -                             continue;

> >> +void odp_packet_free(odp_packet_t pkt)

> >> +{

> >> +     packet_free(odp_packet_hdr(pkt));

> >> +}

> >>

> >> -                     for (j = 1; j < num_seg; j++) {

> >> -                             buf[bufs] = buffer_handle(buf_hdr->seg[j].hdr);

> >> -                             bufs++;

> >> -                     }

> >> -             }

> >> +void odp_packet_free_multi(const odp_packet_t pkt[], int num)

> >> +{

> >> +     int i;

> >>

> >> -             buffer_free_multi(buf, bufs);

> >> -     }

> >> +     for (i = 0; i < num; i++)

> >> +             packet_free(odp_packet_hdr(pkt[i]));

> >>  }

> >>

> >>  int odp_packet_reset(odp_packet_t pkt, uint32_t len)

> >> @@ -599,6 +637,9 @@ int odp_packet_reset(odp_packet_t pkt, uint32_t len)

> >>       if (len > pool->headroom + pool->data_size + pool->tailroom)

> >>               return -1;

> >>

> >> +     if (pkt_hdr->ref_hdr)

> >> +             packet_free(pkt_hdr->ref_hdr);

> >> +

> >>       packet_init(pkt_hdr, len, 0);

> >>

> >>       return 0;

> >> @@ -641,15 +682,21 @@ void *odp_packet_head(odp_packet_t pkt)

> >>  uint32_t odp_packet_buf_len(odp_packet_t pkt)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     uint32_t buf_len = 0;

> >>

> >> -     return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

> >> +     do {

> >> +             buf_len += pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;

> >> +             pkt_hdr  = pkt_hdr->ref_hdr;

> >> +     } while (pkt_hdr);

> >> +

> >> +     return buf_len;

> >>  }

> >>

> >>  void *odp_packet_data(odp_packet_t pkt)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>

> >> -     return packet_data(pkt_hdr);

> >> +     return packet_map(pkt_hdr, 0, NULL, NULL);

> >>  }

> >>

> >>  uint32_t odp_packet_seg_len(odp_packet_t pkt)

> >> @@ -661,7 +708,32 @@ uint32_t odp_packet_seg_len(odp_packet_t pkt)

> >>

> >>  uint32_t odp_packet_len(odp_packet_t pkt)

> >>  {

> >> -     return odp_packet_hdr(pkt)->frame_len;

> >> +     return packet_len(odp_packet_hdr(pkt));

> >> +}

> >> +

> >> +uint32_t odp_packet_unshared_len(odp_packet_t pkt)

> >> +{

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     uint32_t pkt_len = 0, offset = 0;

> >> +

> >> +     do {

> >> +             if (packet_ref_count(pkt_hdr) > 1) {

> >> +                     if (offset == 0)

> >> +                             pkt_len += pkt_hdr->unshared_len;

> >> +                     break;

> >> +             }

> >> +

> >> +             pkt_len += pkt_hdr->frame_len - offset;

> >> +             offset   = pkt_hdr->ref_offset;

> >> +

> >> +             if (pkt_hdr->ref_hdr)

> >> +                     offset += (pkt_hdr->ref_hdr->frame_len -

> >> +                                pkt_hdr->ref_len);

> >> +

> >> +             pkt_hdr = pkt_hdr->ref_hdr;

> >> +     } while (pkt_hdr);

> >> +

> >> +     return pkt_len;

> >>  }

> >>

> >>  uint32_t odp_packet_headroom(odp_packet_t pkt)

> >> @@ -671,12 +743,12 @@ uint32_t odp_packet_headroom(odp_packet_t pkt)

> >>

> >>  uint32_t odp_packet_tailroom(odp_packet_t pkt)

> >>  {

> >> -     return odp_packet_hdr(pkt)->tailroom;

> >> +     return odp_packet_last_hdr(pkt, NULL)->tailroom;

> >>  }

> >>

> >>  void *odp_packet_tail(odp_packet_t pkt)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

> >>

> >>       return packet_tail(pkt_hdr);

> >>  }

> >> @@ -870,7 +942,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >>       uint32_t frame_len = pkt_hdr->frame_len;

> >> -     uint32_t headroom  = pkt_hdr->headroom;

> >> +     uint32_t headroom = pkt_hdr->headroom;

> >>       int ret = 0;

> >>

> >>       if (len > headroom) {

> >> @@ -885,6 +957,46 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

> >>               segs = pkt_hdr->buf_hdr.segcount;

> >>

> >>               if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS)) {

> >> +                     /* Handle recursively via references when

> >> +                      * working with referenced packets since another

> >> +                      * thread may be accessing it concurrently via

> >> +                      * its reference to it. */

> >> +                     if (packet_ref_count(pkt_hdr) > 1) {

> >> +                             odp_packet_t ref;

> >> +                             uint32_t unshared_len;

> >> +

> >> +                             push_head(pkt_hdr, headroom);

> >> +                             unshared_len = pkt_hdr->unshared_len;

> >> +                             ref = odp_packet_ref(*pkt, 0);

> >> +

> >> +                             if (ref == ODP_PACKET_INVALID) {

> >> +                                     pull_head(pkt_hdr, headroom);

> >> +                                     return -1;

> >> +                             }

> >> +

> >> +                             ret = odp_packet_extend_head(&ref,

> >> +                                                          len - headroom,

> >> +                                                          data_ptr,

> >> +                                                          seg_len);

> >> +

> >> +                             if (ret < 0) {

> >> +                                     odp_packet_free(ref);

> >> +                                     pull_head(pkt_hdr, headroom);

> >> +                                     return -1;

> >> +                             }

> >> +

> >> +                             /* Since this is a special ref, the

> >> +                              * base pkt's unshared len is unchanged */

> >> +                             pkt_hdr->unshared_len = unshared_len;

> >> +

> >> +                             /* Remove extra ref to the base pkt */

> >> +                             odp_packet_free(*pkt);

> >> +

> >> +                             /* Return the ref as the extension result */

> >> +                             *pkt = ref;

> >> +                             return 1;

> >> +                     }

> >> +

> >>                       /* Cannot directly add new segments */

> >>                       odp_packet_hdr_t *new_hdr;

> >>                       int new_segs = 0;

> >> @@ -936,6 +1048,7 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

> >>

> >>                       pkt_hdr->buf_hdr.segcount = segs;

> >>                       pkt_hdr->frame_len        = frame_len;

> >> +                     pkt_hdr->unshared_len     = frame_len;

> >>                       pkt_hdr->headroom         = offset + pool->headroom;

> >>                       pkt_hdr->tailroom         = pool->tailroom;

> >>

> >> @@ -961,11 +1074,16 @@ int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,

> >>               push_head(pkt_hdr, len);

> >>       }

> >>

> >> -     if (data_ptr)

> >> -             *data_ptr = packet_data(pkt_hdr);

> >> +     if (data_ptr || seg_len) {

> >> +             uint32_t seg_ln = 0;

> >> +             void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);

> >>

> >> -     if (seg_len)

> >> -             *seg_len = packet_first_seg_len(pkt_hdr);

> >> +             if (data_ptr)

> >> +                     *data_ptr = data;

> >> +

> >> +             if (seg_len)

> >> +                     *seg_len = seg_ln;

> >> +     }

> >>

> >>       return ret;

> >>  }

> >> @@ -977,6 +1095,8 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

> >>       if (len > pkt_hdr->frame_len)

> >>               return NULL;

> >>

> >> +     ODP_ASSERT(len <= pkt_hdr->unshared_len);

> >> +

> >>       pull_head(pkt_hdr, len);

> >>       return packet_data(pkt_hdr);

> >>  }

> >> @@ -984,15 +1104,35 @@ void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)

> >>  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

> >>                         void **data_ptr, uint32_t *seg_len_out)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;

> >>       uint32_t seg_len = packet_first_seg_len(pkt_hdr);

> >> +     int ret = 0;

> >>

> >> -     if (len > pkt_hdr->frame_len)

> >> +     if (len > packet_len(pkt_hdr))

> >>               return -1;

> >>

> >> -     if (len < seg_len) {

> >> +     ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));

> >> +

> >> +     /* Special processing for references */

> >> +     while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >> +             ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >> +             nxt_hdr = pkt_hdr->ref_hdr;

> >> +             len -= pkt_hdr->frame_len;

> >> +             len += pkt_hdr->ref_offset +

> >> +                     (nxt_hdr->frame_len - pkt_hdr->ref_len);

> >> +             pkt_hdr->ref_hdr = NULL;

> >> +             packet_free(pkt_hdr);

> >> +             pkt_hdr = nxt_hdr;

> >> +             seg_len = packet_first_seg_len(pkt_hdr);

> >> +             *pkt = packet_handle(pkt_hdr);

> >> +             ret = 1;

> >> +     }

> >> +

> >> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

> >> +         len < seg_len ||

> >> +         pkt_hdr->buf_hdr.segcount == 1) {

> >>               pull_head(pkt_hdr, len);

> >> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

> >> +     } else {

> >>               int num = 0;

> >>               uint32_t pull_len = 0;

> >>

> >> @@ -1007,23 +1147,29 @@ int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,

> >>               *pkt    = packet_handle(pkt_hdr);

> >>       }

> >>

> >> -     if (data_ptr)

> >> -             *data_ptr = packet_data(pkt_hdr);

> >> +     if (data_ptr || seg_len_out) {

> >> +             void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);

> >>

> >> -     if (seg_len_out)

> >> -             *seg_len_out = packet_first_seg_len(pkt_hdr);

> >> +             if (data_ptr)

> >> +                     *data_ptr = data_head;

> >>

> >> -     return 0;

> >> +             if (seg_len_out)

> >> +                     *seg_len_out = seg_len;

> >> +     }

> >> +

> >> +     return ret;

> >>  }

> >>

> >>  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);

> >>       void *old_tail;

> >>

> >>       if (len > pkt_hdr->tailroom)

> >>               return NULL;

> >>

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >> +

> >>       old_tail = packet_tail(pkt_hdr);

> >>       push_tail(pkt_hdr, len);

> >>

> >> @@ -1033,12 +1179,14 @@ void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)

> >>  int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,

> >>                          void **data_ptr, uint32_t *seg_len_out)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);

> >>       uint32_t frame_len = pkt_hdr->frame_len;

> >>       uint32_t tailroom  = pkt_hdr->tailroom;

> >>       uint32_t tail_off  = frame_len;

> >>       int ret = 0;

> >>

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >> +

> >>       if (len > tailroom) {

> >>               pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);

> >>               int num;

> >> @@ -1129,6 +1277,7 @@ void *odp_packet_pull_tail(odp_packet_t pkt, uint32_t len)

> >>       if (len > packet_last_seg_len(pkt_hdr))

> >>               return NULL;

> >>

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>       pull_tail(pkt_hdr, len);

> >>

> >>       return packet_tail(pkt_hdr);

> >> @@ -1139,17 +1288,34 @@ int odp_packet_trunc_tail(odp_packet_t *pkt, uint32_t len,

> >>  {

> >>       int last;

> >>       uint32_t seg_len;

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);

> >> +     uint32_t offset;

> >> +     odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);

> >> +     odp_packet_hdr_t *pkt_hdr, *prev_hdr;

> >>

> >> -     if (len > pkt_hdr->frame_len)

> >> +     if (len > packet_len(first_hdr))

> >>               return -1;

> >>

> >> +     pkt_hdr = odp_packet_last_hdr(*pkt, &offset);

> >> +

> >> +     /* Special processing for references */

> >> +     while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr) {

> >> +             len -= (pkt_hdr->frame_len - offset);

> >> +             prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr, &offset);

> >> +             ODP_ASSERT(packet_ref_count(prev_hdr) == 1);

> >> +             prev_hdr->ref_hdr = NULL;

> >> +             packet_free(pkt_hdr);

> >> +             pkt_hdr = prev_hdr;

> >> +     }

> >> +

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);

> >>       last    = packet_last_seg(pkt_hdr);

> >>       seg_len = packet_seg_len(pkt_hdr, last);

> >>

> >> -     if (len < seg_len) {

> >> +     if (CONFIG_PACKET_MAX_SEGS == 1 ||

> >> +         len < seg_len ||

> >> +         pkt_hdr->buf_hdr.segcount == 1) {

> >>               pull_tail(pkt_hdr, len);

> >> -     } else if (CONFIG_PACKET_MAX_SEGS != 1) {

> >> +     } else {

> >>               int num = 0;

> >>               uint32_t pull_len = 0;

> >>

> >> @@ -1356,35 +1522,50 @@ void odp_packet_ts_set(odp_packet_t pkt, odp_time_t timestamp)

> >>

> >>  int odp_packet_is_segmented(odp_packet_t pkt)

> >>  {

> >> -     return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +

> >> +     return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;

> >>  }

> >>

> >>  int odp_packet_num_segs(odp_packet_t pkt)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     uint32_t segcount = 0, i;

> >> +     uint32_t seg_offset = 0, offset;

> >> +

> >> +     do {

> >> +             segcount += pkt_hdr->buf_hdr.segcount - seg_offset;

> >> +             offset    = pkt_hdr->ref_offset;

> >> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >> +             if (pkt_hdr) {

> >> +                     for (i = 0, seg_offset = 0;

> >> +                          i < pkt_hdr->buf_hdr.segcount;

> >> +                          i++, seg_offset++) {

> >> +                             if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >> +                                     break;

> >> +                             offset -= pkt_hdr->buf_hdr.seg[i].len;

> >> +                     }

> >> +             }

> >> +     } while (pkt_hdr);

> >>

> >> -     return pkt_hdr->buf_hdr.segcount;

> >> +     return segcount;

> >>  }

> >>

> >> -odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)

> >> +odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)

> >>  {

> >> -     (void)pkt;

> >> -

> >>       return 0;

> >>  }

> >>

> >>  odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)

> >>  {

> >> -     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> -

> >> -     return packet_last_seg(pkt_hdr);

> >> +     return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);

> >>  }

> >>

> >>  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>

> >> -     if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))

> >> +     if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))

> >>               return ODP_PACKET_SEG_INVALID;

> >>

> >>       return seg + 1;

> >> @@ -1400,21 +1581,51 @@ odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)

> >>  void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     uint32_t seg_offset = 0, offset = 0, i;

> >> +

> >> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> >> +            pkt_hdr->ref_hdr) {

> >> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> >> +             offset  = pkt_hdr->ref_offset;

> >> +             pkt_hdr = pkt_hdr->ref_hdr;

> >> +             for (i = 0, seg_offset = 0;

> >> +                  i < pkt_hdr->buf_hdr.segcount;

> >> +                  i++, seg_offset++) {

> >> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >> +                             break;

> >> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

> >> +             }

> >> +     }

> >>

> >> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> >> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

> >>               return NULL;

> >>

> >> -     return packet_seg_data(pkt_hdr, seg);

> >> +     return packet_seg_data(pkt_hdr, seg + seg_offset) + offset;

> >>  }

> >>

> >>  uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t seg)

> >>  {

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +     uint32_t seg_offset = 0, offset = 0, i;

> >> +

> >> +     while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&

> >> +            pkt_hdr->ref_hdr) {

> >> +             seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);

> >> +             offset  = pkt_hdr->ref_offset;

> >> +             pkt_hdr = pkt_hdr->ref_hdr;

> >> +             for (i = 0, seg_offset = 0;

> >> +                  i < pkt_hdr->buf_hdr.segcount;

> >> +                  i++, seg_offset++) {

> >> +                     if (offset < pkt_hdr->buf_hdr.seg[i].len)

> >> +                             break;

> >> +                     offset -= pkt_hdr->buf_hdr.seg[i].len;

> >> +             }

> >> +     }

> >>

> >> -     if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))

> >> +     if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))

> >>               return 0;

> >>

> >> -     return packet_seg_len(pkt_hdr, seg);

> >> +     return packet_seg_len(pkt_hdr, seg + seg_offset) - offset;

> >>  }

> >>

> >>  /*

> >> @@ -1428,12 +1639,14 @@ int odp_packet_add_data(odp_packet_t *pkt_ptr, uint32_t offset, uint32_t len)

> >>  {

> >>       odp_packet_t pkt = *pkt_ptr;

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> -     uint32_t pktlen = pkt_hdr->frame_len;

> >> +     uint32_t pktlen = packet_len(pkt_hdr);

> >>       odp_packet_t newpkt;

> >>

> >>       if (offset > pktlen)

> >>               return -1;

> >>

> >> +     ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);

> >> +

> >>       newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen + len);

> >>

> >>       if (newpkt == ODP_PACKET_INVALID)

> >> @@ -1496,6 +1709,8 @@ int odp_packet_align(odp_packet_t *pkt, uint32_t offset, uint32_t len,

> >>       if (align > ODP_CACHE_LINE_SIZE)

> >>               return -1;

> >>

> >> +     ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);

> >> +

> >>       if (seglen >= len) {

> >>               misalign = align <= 1 ? 0 :

> >>                       ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;

> >> @@ -1535,10 +1750,13 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

> >>       uint32_t dst_len    = dst_hdr->frame_len;

> >>       uint32_t src_len    = src_hdr->frame_len;

> >>

> >> +     ODP_ASSERT(packet_ref_count(dst_hdr) == 1);

> >> +

> >>       /* Do a copy if resulting packet would be out of segments or packets

> >> -      * are from different pools. */

> >> +      * are from different pools or src is a reference. */

> >>       if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS) ||

> >> -         odp_unlikely(dst_pool != src_pool)) {

> >> +         odp_unlikely(dst_pool != src_pool) ||

> >> +         odp_unlikely(packet_ref_count(src_hdr)) > 1) {

> >>               if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >= 0) {

> >>                       (void)odp_packet_copy_from_pkt(*dst, dst_len,

> >>                                                      src, 0, src_len);

> >> @@ -1553,8 +1771,9 @@ int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)

> >>

> >>       add_all_segs(dst_hdr, src_hdr);

> >>

> >> -     dst_hdr->frame_len = dst_len + src_len;

> >> -     dst_hdr->tailroom  = src_hdr->tailroom;

> >> +     dst_hdr->frame_len    = dst_len + src_len;

> >> +     dst_hdr->unshared_len = dst_len + src_len;

> >> +     dst_hdr->tailroom     = src_hdr->tailroom;

> >>

> >>       /* Data was not moved in memory */

> >>       return 0;

> >> @@ -1567,6 +1786,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)

> >>       if (len >= pktlen || tail == NULL)

> >>               return -1;

> >>

> >> +     ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);

> >>       *tail = odp_packet_copy_part(*pkt, len, pktlen - len,

> >>                                    odp_packet_pool(*pkt));

> >>

> >> @@ -1577,6 +1797,109 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)

> >>  }

> >>

> >>  /*

> >> + * References

> >> + */

> >> +

> >> +static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)

> >> +{

> >> +     uint32_t i;

> >> +     odp_packet_hdr_t *hdr;

> >> +

> >> +     do {

> >> +             for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {

> >> +                     hdr = pkt_hdr->buf_hdr.seg[i].hdr;

> >> +                     packet_ref_inc(hdr);

> >> +             }

> >> +

> >> +             pkt_hdr = pkt_hdr->ref_hdr;

> >> +     } while (pkt_hdr);

> >> +}

> >> +

> >> +static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,

> >> +                                      uint32_t offset,

> >> +                                      odp_packet_hdr_t *ref_hdr)

> >> +{

> >> +     /* Catch attempted references to stale handles in debug builds */

> >> +     ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);

> >> +

> >> +     /* Splicing is from the last section of src pkt */

> >> +     while (ref_hdr->ref_hdr)

> >> +             ref_hdr = ref_hdr->ref_hdr;

> >> +

> >> +     /* Find section where splice begins */

> >> +     while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {

> >> +             offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);

> >> +             offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);

> >> +             pkt_hdr   = pkt_hdr->ref_hdr;

> >> +     }

> >> +

> >> +     ref_hdr->ref_hdr    = pkt_hdr;

> >> +     ref_hdr->ref_offset = offset;

> >> +     ref_hdr->ref_len    = pkt_hdr->frame_len;

> >> +

> >> +     if (offset < pkt_hdr->unshared_len)

> >> +             pkt_hdr->unshared_len = offset;

> >> +

> >> +     packet_ref(pkt_hdr);

> >> +     return _odp_packet_hdl(ref_hdr);

> >> +}

> >> +

> >> +odp_packet_t odp_packet_ref_static(odp_packet_t pkt)

> >> +{

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +

> >> +     pkt_hdr->unshared_len = 0;

> >> +     packet_ref(pkt_hdr);

> >> +     return pkt;

> >> +}

> >> +

> >> +odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)

> >> +{

> >> +     odp_packet_t hdr;

> >> +     odp_packet_hdr_t *pkt_hdr;

> >> +

> >> +     if (pkt == ODP_PACKET_INVALID)

> >> +             return ODP_PACKET_INVALID;

> >> +

> >> +     pkt_hdr = odp_packet_hdr(pkt);

> >> +     if (offset >= packet_len(pkt_hdr))

> >> +             return ODP_PACKET_INVALID;

> >> +

> >> +     hdr = odp_packet_alloc(odp_packet_pool(pkt), 0);

> >> +

> >> +     if (hdr == ODP_PACKET_INVALID)

> >> +             return ODP_PACKET_INVALID;

> >> +

> >> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> >> +}

> >> +

> >> +odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,

> >> +                             odp_packet_t hdr)

> >> +{

> >> +     odp_packet_hdr_t *pkt_hdr;

> >> +

> >> +     if (pkt == ODP_PACKET_INVALID ||

> >> +         hdr == ODP_PACKET_INVALID ||

> >> +         pkt == hdr)

> >> +             return ODP_PACKET_INVALID;

> >> +

> >> +     ODP_ASSERT(odp_packet_has_ref(hdr) == 0);

> >> +

> >> +     pkt_hdr = odp_packet_hdr(pkt);

> >> +     if (offset >= packet_len(pkt_hdr))

> >> +             return ODP_PACKET_INVALID;

> >> +

> >> +     return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));

> >> +}

> >> +

> >> +int odp_packet_has_ref(odp_packet_t pkt)

> >> +{

> >> +     odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >> +

> >> +     return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;

> >> +}

> >> +

> >> +/*

> >>   *

> >>   * Copy

> >>   * ********************************************************

> >> @@ -1585,8 +1908,7 @@ int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)

> >>

> >>  odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)

> >>  {

> >> -     odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);

> >> -     uint32_t pktlen = srchdr->frame_len;

> >> +     uint32_t pktlen = odp_packet_len(pkt);

> >>       odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);

> >>

> >>       if (newpkt != ODP_PACKET_INVALID) {

> >> @@ -1625,7 +1947,7 @@ int odp_packet_copy_to_mem(odp_packet_t pkt, uint32_t offset,

> >>       uint8_t *dstaddr = (uint8_t *)dst;

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>

> >> -     if (offset + len > pkt_hdr->frame_len)

> >> +     if (offset + len > packet_len(pkt_hdr))

> >>               return -1;

> >>

> >>       while (len > 0) {

> >> @@ -1649,9 +1971,11 @@ int odp_packet_copy_from_mem(odp_packet_t pkt, uint32_t offset,

> >>       const uint8_t *srcaddr = (const uint8_t *)src;

> >>       odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);

> >>

> >> -     if (offset + len > pkt_hdr->frame_len)

> >> +     if (offset + len > packet_len(pkt_hdr))

> >>               return -1;

> >>

> >> +     ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);

> >> +

> >>       while (len > 0) {

> >>               mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);

> >>               cpylen = len > seglen ? seglen : len;

> >> @@ -1677,10 +2001,12 @@ int odp_packet_copy_from_pkt(odp_packet_t dst, uint32_t dst_offset,

> >>       uint32_t src_seglen = 0; /* GCC */

> >>       int overlap;

> >>

> >> -     if (dst_offset + len > dst_hdr->frame_len ||

> >> -         src_offset + len > src_hdr->frame_len)

> >> +     if (dst_offset + len > packet_len(dst_hdr) ||

> >> +         src_offset + len > packet_len(src_hdr))

> >>               return -1;

> >>

> >> +     ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);

> >> +

> >>       overlap = (dst_hdr == src_hdr &&

> >>                  ((dst_offset <= src_offset &&

> >>                    dst_offset + len >= src_offset) ||

> >> @@ -1764,7 +2090,7 @@ void odp_packet_print(odp_packet_t pkt)

> >>       len += snprintf(&str[len], n - len,

> >>                       "  l4_offset    %" PRIu32 "\n", hdr->p.l4_offset);

> >>       len += snprintf(&str[len], n - len,

> >> -                     "  frame_len    %" PRIu32 "\n", hdr->frame_len);

> >> +                     "  frame_len    %" PRIu32 "\n", packet_len(hdr));

> >>       len += snprintf(&str[len], n - len,

> >>                       "  input        %" PRIu64 "\n",

> >>                       odp_pktio_to_u64(hdr->input));

> >> --

> >> 2.9.3

> >
diff mbox

Patch

diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-generic/include/odp_packet_internal.h
index e6e9d74..607560d 100644
--- a/platform/linux-generic/include/odp_packet_internal.h
+++ b/platform/linux-generic/include/odp_packet_internal.h
@@ -19,6 +19,7 @@  extern "C" {
 
 #include <odp/api/align.h>
 #include <odp/api/debug.h>
+#include <odp_debug_internal.h>
 #include <odp_buffer_internal.h>
 #include <odp_pool_internal.h>
 #include <odp_buffer_inlines.h>
@@ -168,7 +169,7 @@  typedef struct {
  * packet_init(). Because of this any new fields added must be reviewed for
  * initialization requirements.
  */
-typedef struct {
+typedef struct odp_packet_hdr_t {
 	/* common buffer header */
 	odp_buffer_hdr_t buf_hdr;
 
@@ -184,6 +185,13 @@  typedef struct {
 	uint32_t headroom;
 	uint32_t tailroom;
 
+	/* Fields used to support packet references */
+	uint32_t unshared_len;
+	struct odp_packet_hdr_t *ref_hdr;
+	uint32_t ref_offset;
+	uint32_t ref_len;
+	odp_atomic_u32_t ref_count;
+
 	/*
 	 * Members below are not initialized by packet_init()
 	 */
@@ -212,6 +220,55 @@  static inline odp_packet_hdr_t *odp_packet_hdr(odp_packet_t pkt)
 	return (odp_packet_hdr_t *)buf_hdl_to_hdr((odp_buffer_t)pkt);
 }
 
+static inline odp_packet_hdr_t *odp_packet_last_hdr(odp_packet_t pkt,
+						    uint32_t *offset)
+{
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	odp_packet_hdr_t *prev_hdr = pkt_hdr;
+	uint32_t ref_offset = 0;
+
+	while (pkt_hdr->ref_hdr) {
+		ref_offset = pkt_hdr->ref_offset;
+		prev_hdr   = pkt_hdr;
+		pkt_hdr    = pkt_hdr->ref_hdr;
+	}
+
+	if (offset) {
+		if (prev_hdr != pkt_hdr)
+			ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;
+		*offset = ref_offset;
+	}
+
+	return pkt_hdr;
+}
+
+static inline odp_packet_hdr_t *odp_packet_prev_hdr(odp_packet_hdr_t *pkt_hdr,
+						    odp_packet_hdr_t *cur_hdr,
+						    uint32_t *offset)
+{
+	uint32_t ref_offset = 0;
+	odp_packet_hdr_t *prev_hdr = pkt_hdr;
+
+	while (pkt_hdr->ref_hdr != cur_hdr) {
+		ref_offset = pkt_hdr->ref_offset;
+		prev_hdr   = pkt_hdr;
+		pkt_hdr    = pkt_hdr->ref_hdr;
+	}
+
+	if (offset) {
+		if (prev_hdr != pkt_hdr)
+			ref_offset += pkt_hdr->frame_len - prev_hdr->ref_len;
+		*offset = ref_offset;
+	}
+
+	return pkt_hdr;
+}
+
+static inline odp_packet_t _odp_packet_hdl(odp_packet_hdr_t *pkt_hdr)
+{
+	return (odp_packet_t)odp_hdr_to_buf(&pkt_hdr->buf_hdr);
+}
+
 static inline void copy_packet_parser_metadata(odp_packet_hdr_t *src_hdr,
 					       odp_packet_hdr_t *dst_hdr)
 {
@@ -234,17 +291,43 @@  static inline void pull_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)
 
 	pkt_hdr->tailroom  += len;
 	pkt_hdr->frame_len -= len;
+	pkt_hdr->unshared_len -= len;
 	pkt_hdr->buf_hdr.seg[last].len -= len;
 }
 
 static inline uint32_t packet_len(odp_packet_hdr_t *pkt_hdr)
 {
-	return pkt_hdr->frame_len;
+	uint32_t pkt_len = 0;
+	uint32_t offset  = 0;
+
+	do {
+		pkt_len += pkt_hdr->frame_len - offset;
+		offset   = pkt_hdr->ref_offset;
+		if (pkt_hdr->ref_hdr)
+			offset += (pkt_hdr->ref_hdr->frame_len -
+				   pkt_hdr->ref_len);
+		pkt_hdr  = pkt_hdr->ref_hdr;
+	} while (pkt_hdr);
+
+	return pkt_len;
+}
+
+static inline uint32_t packet_ref_count(odp_packet_hdr_t *pkt_hdr)
+{
+	return odp_atomic_load_u32(&pkt_hdr->ref_count);
+}
+
+static inline void packet_ref_count_set(odp_packet_hdr_t *pkt_hdr, uint32_t n)
+{
+	odp_atomic_init_u32(&pkt_hdr->ref_count, n);
 }
 
 static inline void packet_set_len(odp_packet_hdr_t *pkt_hdr, uint32_t len)
 {
+	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
+
 	pkt_hdr->frame_len = len;
+	pkt_hdr->unshared_len = len;
 }
 
 static inline int packet_parse_l2_not_done(packet_parser_t *prs)
diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c
index f632a51..170965a 100644
--- a/platform/linux-generic/odp_packet.c
+++ b/platform/linux-generic/odp_packet.c
@@ -33,13 +33,24 @@  static inline odp_buffer_t buffer_handle(odp_packet_hdr_t *pkt_hdr)
 	return pkt_hdr->buf_hdr.handle.handle;
 }
 
+static inline uint32_t packet_ref_inc(odp_packet_hdr_t *pkt_hdr)
+{
+	return odp_atomic_fetch_inc_u32(&pkt_hdr->ref_count);
+}
+
+static inline uint32_t packet_ref_dec(odp_packet_hdr_t *pkt_hdr)
+{
+	return odp_atomic_fetch_dec_u32(&pkt_hdr->ref_count);
+}
+
 static inline uint32_t packet_seg_len(odp_packet_hdr_t *pkt_hdr,
 				      uint32_t seg_idx)
 {
 	return pkt_hdr->buf_hdr.seg[seg_idx].len;
 }
 
-static inline void *packet_seg_data(odp_packet_hdr_t *pkt_hdr, uint32_t seg_idx)
+static inline uint8_t *packet_seg_data(odp_packet_hdr_t *pkt_hdr,
+				       uint32_t seg_idx)
 {
 	return pkt_hdr->buf_hdr.seg[seg_idx].data;
 }
@@ -52,6 +63,11 @@  static inline int packet_last_seg(odp_packet_hdr_t *pkt_hdr)
 		return pkt_hdr->buf_hdr.segcount - 1;
 }
 
+static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)
+{
+	return pkt_hdr->buf_hdr.seg[0].data;
+}
+
 static inline uint32_t packet_first_seg_len(odp_packet_hdr_t *pkt_hdr)
 {
 	return packet_seg_len(pkt_hdr, 0);
@@ -64,11 +80,6 @@  static inline uint32_t packet_last_seg_len(odp_packet_hdr_t *pkt_hdr)
 	return packet_seg_len(pkt_hdr, last);
 }
 
-static inline void *packet_data(odp_packet_hdr_t *pkt_hdr)
-{
-	return pkt_hdr->buf_hdr.seg[0].data;
-}
-
 static inline void *packet_tail(odp_packet_hdr_t *pkt_hdr)
 {
 	int last = packet_last_seg(pkt_hdr);
@@ -99,6 +110,7 @@  static inline void push_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)
 {
 	pkt_hdr->headroom  -= len;
 	pkt_hdr->frame_len += len;
+	pkt_hdr->unshared_len += len;
 	pkt_hdr->buf_hdr.seg[0].data -= len;
 	pkt_hdr->buf_hdr.seg[0].len  += len;
 }
@@ -107,6 +119,7 @@  static inline void pull_head(odp_packet_hdr_t *pkt_hdr, uint32_t len)
 {
 	pkt_hdr->headroom  += len;
 	pkt_hdr->frame_len -= len;
+	pkt_hdr->unshared_len -= len;
 	pkt_hdr->buf_hdr.seg[0].data += len;
 	pkt_hdr->buf_hdr.seg[0].len  -= len;
 }
@@ -117,6 +130,7 @@  static inline void push_tail(odp_packet_hdr_t *pkt_hdr, uint32_t len)
 
 	pkt_hdr->tailroom  -= len;
 	pkt_hdr->frame_len += len;
+	pkt_hdr->unshared_len += len;
 	pkt_hdr->buf_hdr.seg[last].len += len;
 }
 
@@ -144,6 +158,10 @@  static inline void packet_seg_copy_md(odp_packet_hdr_t *dst,
 	dst->buf_hdr.uarea_addr = src->buf_hdr.uarea_addr;
 	dst->buf_hdr.uarea_size = src->buf_hdr.uarea_size;
 
+	/* reference related metadata */
+	dst->ref_len      = src->ref_len;
+	dst->unshared_len = src->unshared_len;
+
 	/* segmentation data is not copied:
 	 *   buf_hdr.seg[]
 	 *   buf_hdr.segcount
@@ -158,7 +176,15 @@  static inline void *packet_map(odp_packet_hdr_t *pkt_hdr,
 	int seg = 0;
 	int seg_count = pkt_hdr->buf_hdr.segcount;
 
-	if (odp_unlikely(offset >= pkt_hdr->frame_len))
+	/* Special processing for references */
+	while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {
+		offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);
+		offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);
+		pkt_hdr   = pkt_hdr->ref_hdr;
+		seg_count = pkt_hdr->buf_hdr.segcount;
+	}
+
+	if (odp_unlikely(offset > pkt_hdr->frame_len))
 		return NULL;
 
 	if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || seg_count == 1)) {
@@ -207,6 +233,9 @@  void packet_parse_reset(odp_packet_hdr_t *pkt_hdr)
 	pkt_hdr->p.l2_offset        = 0;
 	pkt_hdr->p.l3_offset        = ODP_PACKET_OFFSET_INVALID;
 	pkt_hdr->p.l4_offset        = ODP_PACKET_OFFSET_INVALID;
+
+	/* Ensure dummy pkt_hdrs used in I/O recv classification are valid */
+	pkt_hdr->ref_hdr = NULL;
 }
 
 /**
@@ -252,6 +281,10 @@  static inline void packet_init(odp_packet_hdr_t *pkt_hdr, uint32_t len,
 			     CONFIG_PACKET_TAILROOM;
 
 	pkt_hdr->input = ODP_PKTIO_INVALID;
+
+	/* By default packet has no references */
+	pkt_hdr->unshared_len = len;
+	pkt_hdr->ref_hdr = NULL;
 }
 
 static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)
@@ -264,6 +297,7 @@  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)
 
 	hdr->buf_hdr.seg[0].data = hdr->buf_hdr.base_data;
 	hdr->buf_hdr.seg[0].len  = BASE_LEN;
+	packet_ref_count_set(hdr, 1);
 
 	/* Link segments */
 	if (CONFIG_PACKET_MAX_SEGS != 1) {
@@ -273,6 +307,7 @@  static inline void init_segments(odp_packet_hdr_t *pkt_hdr[], int num)
 			for (i = 1; i < num; i++) {
 				odp_buffer_hdr_t *buf_hdr;
 
+				packet_ref_count_set(pkt_hdr[i], 1);
 				buf_hdr = &pkt_hdr[i]->buf_hdr;
 				hdr->buf_hdr.seg[i].hdr  = buf_hdr;
 				hdr->buf_hdr.seg[i].data = buf_hdr->base_data;
@@ -376,9 +411,10 @@  static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,
 		new_hdr->buf_hdr.seg[0].len   = seg_len;
 
 		packet_seg_copy_md(new_hdr, pkt_hdr);
-		new_hdr->frame_len = pkt_hdr->frame_len + len;
-		new_hdr->headroom  = pool->headroom + offset;
-		new_hdr->tailroom  = pkt_hdr->tailroom;
+		new_hdr->frame_len    = pkt_hdr->frame_len + len;
+		new_hdr->unshared_len = pkt_hdr->unshared_len + len;
+		new_hdr->headroom     = pool->headroom + offset;
+		new_hdr->tailroom     = pkt_hdr->tailroom;
 
 		pkt_hdr = new_hdr;
 	} else {
@@ -391,8 +427,9 @@  static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,
 		last = packet_last_seg(pkt_hdr);
 		pkt_hdr->buf_hdr.seg[last].len = seg_len;
 
-		pkt_hdr->frame_len += len;
-		pkt_hdr->tailroom   = pool->tailroom + offset;
+		pkt_hdr->frame_len    += len;
+		pkt_hdr->unshared_len += len;
+		pkt_hdr->tailroom      = pool->tailroom + offset;
 	}
 
 	return pkt_hdr;
@@ -400,13 +437,18 @@  static inline odp_packet_hdr_t *add_segments(odp_packet_hdr_t *pkt_hdr,
 
 static inline void free_bufs(odp_packet_hdr_t *pkt_hdr, int first, int num)
 {
-	int i;
+	int i, nfree;
 	odp_buffer_t buf[num];
 
-	for (i = 0; i < num; i++)
-		buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[first + i].hdr);
+	for (i = 0, nfree = 0; i < num; i++) {
+		odp_packet_hdr_t *hdr = pkt_hdr->buf_hdr.seg[first + i].hdr;
+
+		if (packet_ref_dec(hdr) == 1)
+			buf[nfree++] = buffer_handle(hdr);
+	}
 
-	buffer_free_multi(buf, num);
+	if (nfree > 0)
+		buffer_free_multi(buf, nfree);
 }
 
 static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,
@@ -417,11 +459,15 @@  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,
 
 	if (head) {
 		odp_packet_hdr_t *new_hdr;
-		int i;
+		int i, nfree;
 		odp_buffer_t buf[num];
 
-		for (i = 0; i < num; i++)
-			buf[i] = buffer_handle(pkt_hdr->buf_hdr.seg[i].hdr);
+		for (i = 0, nfree = 0; i < num; i++) {
+			new_hdr = pkt_hdr->buf_hdr.seg[i].hdr;
+
+			if (packet_ref_dec(new_hdr) == 1)
+				buf[nfree++] = buffer_handle(new_hdr);
+		}
 
 		/* First remaining segment is the new packet descriptor */
 		new_hdr = pkt_hdr->buf_hdr.seg[num].hdr;
@@ -430,15 +476,17 @@  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,
 		packet_seg_copy_md(new_hdr, pkt_hdr);
 
 		/* Tailroom not changed */
-		new_hdr->tailroom  = pkt_hdr->tailroom;
-		new_hdr->headroom  = seg_headroom(new_hdr, 0);
-		new_hdr->frame_len = pkt_hdr->frame_len - free_len;
+		new_hdr->tailroom     = pkt_hdr->tailroom;
+		new_hdr->headroom     = seg_headroom(new_hdr, 0);
+		new_hdr->frame_len    = pkt_hdr->frame_len - free_len;
+		new_hdr->unshared_len = pkt_hdr->unshared_len - free_len;
 
 		pull_head(new_hdr, pull_len);
 
 		pkt_hdr = new_hdr;
 
-		buffer_free_multi(buf, num);
+		if (nfree > 0)
+			buffer_free_multi(buf, nfree);
 	} else {
 		/* Free last 'num' bufs */
 		free_bufs(pkt_hdr, num_remain, num);
@@ -447,6 +495,7 @@  static inline odp_packet_hdr_t *free_segments(odp_packet_hdr_t *pkt_hdr,
 		 * of the metadata. */
 		pkt_hdr->buf_hdr.segcount = num_remain;
 		pkt_hdr->frame_len -= free_len;
+		pkt_hdr->unshared_len -= free_len;
 		pkt_hdr->tailroom = seg_tailroom(pkt_hdr, num_remain - 1);
 
 		pull_tail(pkt_hdr, pull_len);
@@ -550,45 +599,34 @@  int odp_packet_alloc_multi(odp_pool_t pool_hdl, uint32_t len,
 	return num;
 }
 
-void odp_packet_free(odp_packet_t pkt)
+static inline void packet_free(odp_packet_hdr_t *pkt_hdr)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
-	int num_seg = pkt_hdr->buf_hdr.segcount;
+	odp_packet_hdr_t *ref_hdr;
+	uint32_t ref_count;
 
-	if (odp_likely(CONFIG_PACKET_MAX_SEGS == 1 || num_seg == 1))
-		buffer_free_multi((odp_buffer_t *)&pkt, 1);
-	else
-		free_bufs(pkt_hdr, 0, num_seg);
-}
+	do {
+		ref_hdr = pkt_hdr->ref_hdr;
+		ref_count = packet_ref_count(pkt_hdr) - 1;
+		free_bufs(pkt_hdr, 0, pkt_hdr->buf_hdr.segcount);
 
-void odp_packet_free_multi(const odp_packet_t pkt[], int num)
-{
-	if (CONFIG_PACKET_MAX_SEGS == 1) {
-		buffer_free_multi((const odp_buffer_t * const)pkt, num);
-	} else {
-		odp_buffer_t buf[num * CONFIG_PACKET_MAX_SEGS];
-		int i, j;
-		int bufs = 0;
+		if (ref_count == 1)
+			pkt_hdr->unshared_len = pkt_hdr->frame_len;
 
-		for (i = 0; i < num; i++) {
-			odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt[i]);
-			int num_seg = pkt_hdr->buf_hdr.segcount;
-			odp_buffer_hdr_t *buf_hdr = &pkt_hdr->buf_hdr;
-
-			buf[bufs] = (odp_buffer_t)pkt[i];
-			bufs++;
+		pkt_hdr = ref_hdr;
+	} while (pkt_hdr);
+}
 
-			if (odp_likely(num_seg == 1))
-				continue;
+void odp_packet_free(odp_packet_t pkt)
+{
+	packet_free(odp_packet_hdr(pkt));
+}
 
-			for (j = 1; j < num_seg; j++) {
-				buf[bufs] = buffer_handle(buf_hdr->seg[j].hdr);
-				bufs++;
-			}
-		}
+void odp_packet_free_multi(const odp_packet_t pkt[], int num)
+{
+	int i;
 
-		buffer_free_multi(buf, bufs);
-	}
+	for (i = 0; i < num; i++)
+		packet_free(odp_packet_hdr(pkt[i]));
 }
 
 int odp_packet_reset(odp_packet_t pkt, uint32_t len)
@@ -599,6 +637,9 @@  int odp_packet_reset(odp_packet_t pkt, uint32_t len)
 	if (len > pool->headroom + pool->data_size + pool->tailroom)
 		return -1;
 
+	if (pkt_hdr->ref_hdr)
+		packet_free(pkt_hdr->ref_hdr);
+
 	packet_init(pkt_hdr, len, 0);
 
 	return 0;
@@ -641,15 +682,21 @@  void *odp_packet_head(odp_packet_t pkt)
 uint32_t odp_packet_buf_len(odp_packet_t pkt)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	uint32_t buf_len = 0;
 
-	return pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;
+	do {
+		buf_len += pkt_hdr->buf_hdr.size * pkt_hdr->buf_hdr.segcount;
+		pkt_hdr  = pkt_hdr->ref_hdr;
+	} while (pkt_hdr);
+
+	return buf_len;
 }
 
 void *odp_packet_data(odp_packet_t pkt)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
 
-	return packet_data(pkt_hdr);
+	return packet_map(pkt_hdr, 0, NULL, NULL);
 }
 
 uint32_t odp_packet_seg_len(odp_packet_t pkt)
@@ -661,7 +708,32 @@  uint32_t odp_packet_seg_len(odp_packet_t pkt)
 
 uint32_t odp_packet_len(odp_packet_t pkt)
 {
-	return odp_packet_hdr(pkt)->frame_len;
+	return packet_len(odp_packet_hdr(pkt));
+}
+
+uint32_t odp_packet_unshared_len(odp_packet_t pkt)
+{
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	uint32_t pkt_len = 0, offset = 0;
+
+	do {
+		if (packet_ref_count(pkt_hdr) > 1) {
+			if (offset == 0)
+				pkt_len += pkt_hdr->unshared_len;
+			break;
+		}
+
+		pkt_len += pkt_hdr->frame_len - offset;
+		offset   = pkt_hdr->ref_offset;
+
+		if (pkt_hdr->ref_hdr)
+			offset += (pkt_hdr->ref_hdr->frame_len -
+				   pkt_hdr->ref_len);
+
+		pkt_hdr = pkt_hdr->ref_hdr;
+	} while (pkt_hdr);
+
+	return pkt_len;
 }
 
 uint32_t odp_packet_headroom(odp_packet_t pkt)
@@ -671,12 +743,12 @@  uint32_t odp_packet_headroom(odp_packet_t pkt)
 
 uint32_t odp_packet_tailroom(odp_packet_t pkt)
 {
-	return odp_packet_hdr(pkt)->tailroom;
+	return odp_packet_last_hdr(pkt, NULL)->tailroom;
 }
 
 void *odp_packet_tail(odp_packet_t pkt)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);
 
 	return packet_tail(pkt_hdr);
 }
@@ -870,7 +942,7 @@  int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);
 	uint32_t frame_len = pkt_hdr->frame_len;
-	uint32_t headroom  = pkt_hdr->headroom;
+	uint32_t headroom = pkt_hdr->headroom;
 	int ret = 0;
 
 	if (len > headroom) {
@@ -885,6 +957,46 @@  int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,
 		segs = pkt_hdr->buf_hdr.segcount;
 
 		if (odp_unlikely((segs + num) > CONFIG_PACKET_MAX_SEGS)) {
+			/* Handle recursively via references when
+			 * working with referenced packets since another
+			 * thread may be accessing it concurrently via
+			 * its reference to it. */
+			if (packet_ref_count(pkt_hdr) > 1) {
+				odp_packet_t ref;
+				uint32_t unshared_len;
+
+				push_head(pkt_hdr, headroom);
+				unshared_len = pkt_hdr->unshared_len;
+				ref = odp_packet_ref(*pkt, 0);
+
+				if (ref == ODP_PACKET_INVALID) {
+					pull_head(pkt_hdr, headroom);
+					return -1;
+				}
+
+				ret = odp_packet_extend_head(&ref,
+							     len - headroom,
+							     data_ptr,
+							     seg_len);
+
+				if (ret < 0) {
+					odp_packet_free(ref);
+					pull_head(pkt_hdr, headroom);
+					return -1;
+				}
+
+				/* Since this is a special ref, the
+				 * base pkt's unshared len is unchanged */
+				pkt_hdr->unshared_len = unshared_len;
+
+				/* Remove extra ref to the base pkt */
+				odp_packet_free(*pkt);
+
+				/* Return the ref as the extension result */
+				*pkt = ref;
+				return 1;
+			}
+
 			/* Cannot directly add new segments */
 			odp_packet_hdr_t *new_hdr;
 			int new_segs = 0;
@@ -936,6 +1048,7 @@  int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,
 
 			pkt_hdr->buf_hdr.segcount = segs;
 			pkt_hdr->frame_len        = frame_len;
+			pkt_hdr->unshared_len     = frame_len;
 			pkt_hdr->headroom         = offset + pool->headroom;
 			pkt_hdr->tailroom         = pool->tailroom;
 
@@ -961,11 +1074,16 @@  int odp_packet_extend_head(odp_packet_t *pkt, uint32_t len,
 		push_head(pkt_hdr, len);
 	}
 
-	if (data_ptr)
-		*data_ptr = packet_data(pkt_hdr);
+	if (data_ptr || seg_len) {
+		uint32_t seg_ln = 0;
+		void *data = packet_map(pkt_hdr, 0, &seg_ln, NULL);
 
-	if (seg_len)
-		*seg_len = packet_first_seg_len(pkt_hdr);
+		if (data_ptr)
+			*data_ptr = data;
+
+		if (seg_len)
+			*seg_len = seg_ln;
+	}
 
 	return ret;
 }
@@ -977,6 +1095,8 @@  void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)
 	if (len > pkt_hdr->frame_len)
 		return NULL;
 
+	ODP_ASSERT(len <= pkt_hdr->unshared_len);
+
 	pull_head(pkt_hdr, len);
 	return packet_data(pkt_hdr);
 }
@@ -984,15 +1104,35 @@  void *odp_packet_pull_head(odp_packet_t pkt, uint32_t len)
 int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,
 			  void **data_ptr, uint32_t *seg_len_out)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt), *nxt_hdr;
 	uint32_t seg_len = packet_first_seg_len(pkt_hdr);
+	int ret = 0;
 
-	if (len > pkt_hdr->frame_len)
+	if (len > packet_len(pkt_hdr))
 		return -1;
 
-	if (len < seg_len) {
+	ODP_ASSERT(len <= odp_packet_unshared_len(*pkt));
+
+	/* Special processing for references */
+	while (len >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {
+		ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
+		nxt_hdr = pkt_hdr->ref_hdr;
+		len -= pkt_hdr->frame_len;
+		len += pkt_hdr->ref_offset +
+			(nxt_hdr->frame_len - pkt_hdr->ref_len);
+		pkt_hdr->ref_hdr = NULL;
+		packet_free(pkt_hdr);
+		pkt_hdr = nxt_hdr;
+		seg_len = packet_first_seg_len(pkt_hdr);
+		*pkt = packet_handle(pkt_hdr);
+		ret = 1;
+	}
+
+	if (CONFIG_PACKET_MAX_SEGS == 1 ||
+	    len < seg_len ||
+	    pkt_hdr->buf_hdr.segcount == 1) {
 		pull_head(pkt_hdr, len);
-	} else if (CONFIG_PACKET_MAX_SEGS != 1) {
+	} else {
 		int num = 0;
 		uint32_t pull_len = 0;
 
@@ -1007,23 +1147,29 @@  int odp_packet_trunc_head(odp_packet_t *pkt, uint32_t len,
 		*pkt    = packet_handle(pkt_hdr);
 	}
 
-	if (data_ptr)
-		*data_ptr = packet_data(pkt_hdr);
+	if (data_ptr || seg_len_out) {
+		void *data_head = packet_map(pkt_hdr, 0, &seg_len, NULL);
 
-	if (seg_len_out)
-		*seg_len_out = packet_first_seg_len(pkt_hdr);
+		if (data_ptr)
+			*data_ptr = data_head;
 
-	return 0;
+		if (seg_len_out)
+			*seg_len_out = seg_len;
+	}
+
+	return ret;
 }
 
 void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(pkt, NULL);
 	void *old_tail;
 
 	if (len > pkt_hdr->tailroom)
 		return NULL;
 
+	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
+
 	old_tail = packet_tail(pkt_hdr);
 	push_tail(pkt_hdr, len);
 
@@ -1033,12 +1179,14 @@  void *odp_packet_push_tail(odp_packet_t pkt, uint32_t len)
 int odp_packet_extend_tail(odp_packet_t *pkt, uint32_t len,
 			   void **data_ptr, uint32_t *seg_len_out)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);
+	odp_packet_hdr_t *pkt_hdr = odp_packet_last_hdr(*pkt, NULL);
 	uint32_t frame_len = pkt_hdr->frame_len;
 	uint32_t tailroom  = pkt_hdr->tailroom;
 	uint32_t tail_off  = frame_len;
 	int ret = 0;
 
+	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
+
 	if (len > tailroom) {
 		pool_t *pool = pool_entry_from_hdl(pkt_hdr->buf_hdr.pool_hdl);
 		int num;
@@ -1129,6 +1277,7 @@  void *odp_packet_pull_tail(odp_packet_t pkt, uint32_t len)
 	if (len > packet_last_seg_len(pkt_hdr))
 		return NULL;
 
+	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
 	pull_tail(pkt_hdr, len);
 
 	return packet_tail(pkt_hdr);
@@ -1139,17 +1288,34 @@  int odp_packet_trunc_tail(odp_packet_t *pkt, uint32_t len,
 {
 	int last;
 	uint32_t seg_len;
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(*pkt);
+	uint32_t offset;
+	odp_packet_hdr_t *first_hdr = odp_packet_hdr(*pkt);
+	odp_packet_hdr_t *pkt_hdr, *prev_hdr;
 
-	if (len > pkt_hdr->frame_len)
+	if (len > packet_len(first_hdr))
 		return -1;
 
+	pkt_hdr = odp_packet_last_hdr(*pkt, &offset);
+
+	/* Special processing for references */
+	while (len >= pkt_hdr->frame_len - offset && first_hdr->ref_hdr) {
+		len -= (pkt_hdr->frame_len - offset);
+		prev_hdr = odp_packet_prev_hdr(first_hdr, pkt_hdr, &offset);
+		ODP_ASSERT(packet_ref_count(prev_hdr) == 1);
+		prev_hdr->ref_hdr = NULL;
+		packet_free(pkt_hdr);
+		pkt_hdr = prev_hdr;
+	}
+
+	ODP_ASSERT(packet_ref_count(pkt_hdr) == 1);
 	last    = packet_last_seg(pkt_hdr);
 	seg_len = packet_seg_len(pkt_hdr, last);
 
-	if (len < seg_len) {
+	if (CONFIG_PACKET_MAX_SEGS == 1 ||
+	    len < seg_len ||
+	    pkt_hdr->buf_hdr.segcount == 1) {
 		pull_tail(pkt_hdr, len);
-	} else if (CONFIG_PACKET_MAX_SEGS != 1) {
+	} else {
 		int num = 0;
 		uint32_t pull_len = 0;
 
@@ -1356,35 +1522,50 @@  void odp_packet_ts_set(odp_packet_t pkt, odp_time_t timestamp)
 
 int odp_packet_is_segmented(odp_packet_t pkt)
 {
-	return odp_packet_hdr(pkt)->buf_hdr.segcount > 1;
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+
+	return pkt_hdr->buf_hdr.segcount > 1 || pkt_hdr->ref_hdr != NULL;
 }
 
 int odp_packet_num_segs(odp_packet_t pkt)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	uint32_t segcount = 0, i;
+	uint32_t seg_offset = 0, offset;
+
+	do {
+		segcount += pkt_hdr->buf_hdr.segcount - seg_offset;
+		offset    = pkt_hdr->ref_offset;
+		pkt_hdr   = pkt_hdr->ref_hdr;
+		if (pkt_hdr) {
+			for (i = 0, seg_offset = 0;
+			     i < pkt_hdr->buf_hdr.segcount;
+			     i++, seg_offset++) {
+				if (offset < pkt_hdr->buf_hdr.seg[i].len)
+					break;
+				offset -= pkt_hdr->buf_hdr.seg[i].len;
+			}
+		}
+	} while (pkt_hdr);
 
-	return pkt_hdr->buf_hdr.segcount;
+	return segcount;
 }
 
-odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt)
+odp_packet_seg_t odp_packet_first_seg(odp_packet_t pkt ODP_UNUSED)
 {
-	(void)pkt;
-
 	return 0;
 }
 
 odp_packet_seg_t odp_packet_last_seg(odp_packet_t pkt)
 {
-	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
-
-	return packet_last_seg(pkt_hdr);
+	return (odp_packet_seg_t)(odp_packet_num_segs(pkt) - 1);
 }
 
 odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
 
-	if (odp_unlikely(seg >= (odp_packet_seg_t)packet_last_seg(pkt_hdr)))
+	if (odp_unlikely(seg >= packet_last_seg(pkt_hdr)))
 		return ODP_PACKET_SEG_INVALID;
 
 	return seg + 1;
@@ -1400,21 +1581,51 @@  odp_packet_seg_t odp_packet_next_seg(odp_packet_t pkt, odp_packet_seg_t seg)
 void *odp_packet_seg_data(odp_packet_t pkt, odp_packet_seg_t seg)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	uint32_t seg_offset = 0, offset = 0, i;
+
+	/* Follow the ref chain; offset only trims the first shared segment */
+	while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&
+	       pkt_hdr->ref_hdr) {
+		seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);
+		offset  = pkt_hdr->ref_offset;
+		pkt_hdr = pkt_hdr->ref_hdr;
+		for (i = 0, seg_offset = 0; i < pkt_hdr->buf_hdr.segcount;
+		     i++, seg_offset++) {
+			if (offset < pkt_hdr->buf_hdr.seg[i].len)
+				break;
+			offset -= pkt_hdr->buf_hdr.seg[i].len;
+		}
+	}
 
-	if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))
+	if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))
 		return NULL;
 
-	return packet_seg_data(pkt_hdr, seg);
+	return packet_seg_data(pkt_hdr, seg + seg_offset) + (seg ? 0 : offset);
 }
 
 uint32_t odp_packet_seg_data_len(odp_packet_t pkt, odp_packet_seg_t seg)
 {
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+	uint32_t seg_offset = 0, offset = 0, i;
+
+	/* Follow the ref chain; offset only shortens the first shared seg */
+	while (seg >= pkt_hdr->buf_hdr.segcount - seg_offset &&
+	       pkt_hdr->ref_hdr) {
+		seg    -= (pkt_hdr->buf_hdr.segcount - seg_offset);
+		offset  = pkt_hdr->ref_offset;
+		pkt_hdr = pkt_hdr->ref_hdr;
+		for (i = 0, seg_offset = 0; i < pkt_hdr->buf_hdr.segcount;
+		     i++, seg_offset++) {
+			if (offset < pkt_hdr->buf_hdr.seg[i].len)
+				break;
+			offset -= pkt_hdr->buf_hdr.seg[i].len;
+		}
+	}
 
-	if (odp_unlikely(seg >= pkt_hdr->buf_hdr.segcount))
+	if (odp_unlikely(seg + seg_offset >= pkt_hdr->buf_hdr.segcount))
 		return 0;
 
-	return packet_seg_len(pkt_hdr, seg);
+	return packet_seg_len(pkt_hdr, seg + seg_offset) - (seg ? 0 : offset);
 }
 
 /*
@@ -1428,12 +1639,14 @@  int odp_packet_add_data(odp_packet_t *pkt_ptr, uint32_t offset, uint32_t len)
 {
 	odp_packet_t pkt = *pkt_ptr;
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
-	uint32_t pktlen = pkt_hdr->frame_len;
+	uint32_t pktlen = packet_len(pkt_hdr);
 	odp_packet_t newpkt;
 
 	if (offset > pktlen)
 		return -1;
 
+	ODP_ASSERT(odp_packet_unshared_len(*pkt_ptr) >= offset);
+
 	newpkt = odp_packet_alloc(pkt_hdr->buf_hdr.pool_hdl, pktlen + len);
 
 	if (newpkt == ODP_PACKET_INVALID)
@@ -1496,6 +1709,8 @@  int odp_packet_align(odp_packet_t *pkt, uint32_t offset, uint32_t len,
 	if (align > ODP_CACHE_LINE_SIZE)
 		return -1;
 
+	ODP_ASSERT(odp_packet_has_ref(*pkt) == 0);
+
 	if (seglen >= len) {
 		misalign = align <= 1 ? 0 :
 			ODP_ALIGN_ROUNDUP(uaddr, align) - uaddr;
@@ -1535,10 +1750,13 @@  int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)
 	uint32_t dst_len    = dst_hdr->frame_len;
 	uint32_t src_len    = src_hdr->frame_len;
 
+	ODP_ASSERT(packet_ref_count(dst_hdr) == 1);
+
 	/* Do a copy if resulting packet would be out of segments or packets
-	 * are from different pools. */
+	 * are from different pools or src is shared (ref count > 1). */
 	if (odp_unlikely((dst_segs + src_segs) > CONFIG_PACKET_MAX_SEGS) ||
-	    odp_unlikely(dst_pool != src_pool)) {
+	    odp_unlikely(dst_pool != src_pool) ||
+	    odp_unlikely(packet_ref_count(src_hdr) > 1)) {
 		if (odp_packet_extend_tail(dst, src_len, NULL, NULL) >= 0) {
 			(void)odp_packet_copy_from_pkt(*dst, dst_len,
 						       src, 0, src_len);
@@ -1553,8 +1771,9 @@  int odp_packet_concat(odp_packet_t *dst, odp_packet_t src)
 
 	add_all_segs(dst_hdr, src_hdr);
 
-	dst_hdr->frame_len = dst_len + src_len;
-	dst_hdr->tailroom  = src_hdr->tailroom;
+	dst_hdr->frame_len    = dst_len + src_len;
+	dst_hdr->unshared_len = dst_len + src_len;
+	dst_hdr->tailroom     = src_hdr->tailroom;
 
 	/* Data was not moved in memory */
 	return 0;
@@ -1567,6 +1786,7 @@  int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)
 	if (len >= pktlen || tail == NULL)
 		return -1;
 
+	ODP_ASSERT(odp_packet_unshared_len(*pkt) >= len);
 	*tail = odp_packet_copy_part(*pkt, len, pktlen - len,
 				     odp_packet_pool(*pkt));
 
@@ -1577,6 +1797,109 @@  int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)
 }
 
 /*
+ * References
+ */
+
+static inline void packet_ref(odp_packet_hdr_t *pkt_hdr)
+{
+	uint32_t i;
+	odp_packet_hdr_t *hdr;
+
+	do { /* visit every header on the ref_hdr chain */
+		for (i = 0; i < pkt_hdr->buf_hdr.segcount; i++) {
+			hdr = pkt_hdr->buf_hdr.seg[i].hdr;
+			packet_ref_inc(hdr); /* once per segment header */
+		}
+
+		pkt_hdr = pkt_hdr->ref_hdr; /* NULL ends the chain */
+	} while (pkt_hdr);
+}
+
+static inline odp_packet_t packet_splice(odp_packet_hdr_t *pkt_hdr,
+					 uint32_t offset,
+					 odp_packet_hdr_t *ref_hdr)
+{
+	/* Catch attempted references to stale handles in debug builds */
+	ODP_ASSERT(packet_ref_count(pkt_hdr) > 0);
+
+	/* Link from the last header on ref_hdr's own reference chain */
+	while (ref_hdr->ref_hdr)
+		ref_hdr = ref_hdr->ref_hdr;
+
+	/* Find section where splice begins */
+	while (offset >= pkt_hdr->frame_len && pkt_hdr->ref_hdr) {
+		offset   -= (pkt_hdr->frame_len - pkt_hdr->ref_offset);
+		offset   += (pkt_hdr->ref_hdr->frame_len - pkt_hdr->ref_len);
+		pkt_hdr   = pkt_hdr->ref_hdr;
+	}
+
+	ref_hdr->ref_hdr    = pkt_hdr; /* chain tail now points at pkt */
+	ref_hdr->ref_offset = offset;
+	ref_hdr->ref_len    = pkt_hdr->frame_len; /* len at splice time */
+
+	if (offset < pkt_hdr->unshared_len) /* only ever shrink it */
+		pkt_hdr->unshared_len = offset;
+
+	packet_ref(pkt_hdr); /* covers pkt_hdr and everything it refers to */
+	return _odp_packet_hdl(ref_hdr);
+}
+
+odp_packet_t odp_packet_ref_static(odp_packet_t pkt)
+{
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+
+	pkt_hdr->unshared_len = 0; /* no unshared bytes remain */
+	packet_ref(pkt_hdr);
+	return pkt; /* same handle: no new header is allocated */
+}
+
+odp_packet_t odp_packet_ref(odp_packet_t pkt, uint32_t offset)
+{
+	odp_packet_t hdr;
+	odp_packet_hdr_t *pkt_hdr;
+
+	if (pkt == ODP_PACKET_INVALID)
+		return ODP_PACKET_INVALID;
+
+	pkt_hdr = odp_packet_hdr(pkt);
+	if (offset >= packet_len(pkt_hdr))
+		return ODP_PACKET_INVALID; /* offset past end of data */
+
+	hdr = odp_packet_alloc(odp_packet_pool(pkt), 0); /* 0-len header */
+
+	if (hdr == ODP_PACKET_INVALID)
+		return ODP_PACKET_INVALID; /* allocation failed */
+
+	return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));
+}
+
+odp_packet_t odp_packet_ref_pkt(odp_packet_t pkt, uint32_t offset,
+				odp_packet_t hdr)
+{
+	odp_packet_hdr_t *pkt_hdr;
+
+	if (pkt == ODP_PACKET_INVALID ||
+	    hdr == ODP_PACKET_INVALID ||
+	    pkt == hdr) /* a packet cannot be spliced onto itself */
+		return ODP_PACKET_INVALID;
+
+	ODP_ASSERT(odp_packet_has_ref(hdr) == 0); /* debug-only check */
+
+	pkt_hdr = odp_packet_hdr(pkt);
+	if (offset >= packet_len(pkt_hdr))
+		return ODP_PACKET_INVALID; /* offset past end of data */
+
+	return packet_splice(pkt_hdr, offset, odp_packet_hdr(hdr));
+}
+
+int odp_packet_has_ref(odp_packet_t pkt)
+{
+	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
+
+	return pkt_hdr->ref_hdr != NULL || packet_ref_count(pkt_hdr) > 1;
+}
+
+/*
  *
  * Copy
  * ********************************************************
@@ -1585,8 +1908,7 @@  int odp_packet_split(odp_packet_t *pkt, uint32_t len, odp_packet_t *tail)
 
 odp_packet_t odp_packet_copy(odp_packet_t pkt, odp_pool_t pool)
 {
-	odp_packet_hdr_t *srchdr = odp_packet_hdr(pkt);
-	uint32_t pktlen = srchdr->frame_len;
+	uint32_t pktlen = odp_packet_len(pkt);
 	odp_packet_t newpkt = odp_packet_alloc(pool, pktlen);
 
 	if (newpkt != ODP_PACKET_INVALID) {
@@ -1625,7 +1947,7 @@  int odp_packet_copy_to_mem(odp_packet_t pkt, uint32_t offset,
 	uint8_t *dstaddr = (uint8_t *)dst;
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
 
-	if (offset + len > pkt_hdr->frame_len)
+	if (offset + len > packet_len(pkt_hdr))
 		return -1;
 
 	while (len > 0) {
@@ -1649,9 +1971,11 @@  int odp_packet_copy_from_mem(odp_packet_t pkt, uint32_t offset,
 	const uint8_t *srcaddr = (const uint8_t *)src;
 	odp_packet_hdr_t *pkt_hdr = odp_packet_hdr(pkt);
 
-	if (offset + len > pkt_hdr->frame_len)
+	if (offset + len > packet_len(pkt_hdr))
 		return -1;
 
+	ODP_ASSERT(odp_packet_unshared_len(pkt) >= offset + len);
+
 	while (len > 0) {
 		mapaddr = packet_map(pkt_hdr, offset, &seglen, NULL);
 		cpylen = len > seglen ? seglen : len;
@@ -1677,10 +2001,12 @@  int odp_packet_copy_from_pkt(odp_packet_t dst, uint32_t dst_offset,
 	uint32_t src_seglen = 0; /* GCC */
 	int overlap;
 
-	if (dst_offset + len > dst_hdr->frame_len ||
-	    src_offset + len > src_hdr->frame_len)
+	if (dst_offset + len > packet_len(dst_hdr) ||
+	    src_offset + len > packet_len(src_hdr))
 		return -1;
 
+	ODP_ASSERT(odp_packet_unshared_len(dst) >= dst_offset + len);
+
 	overlap = (dst_hdr == src_hdr &&
 		   ((dst_offset <= src_offset &&
 		     dst_offset + len >= src_offset) ||
@@ -1764,7 +2090,7 @@  void odp_packet_print(odp_packet_t pkt)
 	len += snprintf(&str[len], n - len,
 			"  l4_offset    %" PRIu32 "\n", hdr->p.l4_offset);
 	len += snprintf(&str[len], n - len,
-			"  frame_len    %" PRIu32 "\n", hdr->frame_len);
+			"  frame_len    %" PRIu32 "\n", packet_len(hdr));
 	len += snprintf(&str[len], n - len,
 			"  input        %" PRIu64 "\n",
 			odp_pktio_to_u64(hdr->input));