diff mbox

[v2,1/3] linux-gen: pool: optimize thread local buffer cache

Message ID 1469022168-28941-1-git-send-email-matias.elo@nokia.com
State Superseded
Headers show

Commit Message

Elo, Matias (Nokia - FI/Espoo) July 20, 2016, 1:42 p.m. UTC
Optimize local buffer cache performance which is critical to
many use cases - including packet IO.

Main parts of the optimization are:
 * Local cache implemented as an array of buf_hdr pointers,
   instead of a linked list (which causes a lot of cache misses)
 * Alloc and free N buffers per operation

All of the above steps are needed to achieve the performance upgrade.
Some related pool functions (get_buf(), ret_buf(), etc.) were moved
from the pool header to the C source file, since they were actually
local to that file. Some unused pool variables were also removed.

Signed-off-by: Petri Savolainen <petri.savolainen@nokia.com>

Signed-off-by: Matias Elo <matias.elo@nokia.com>

---

V2:
- Split pktio modifications into a separate patch (Bill)
- Improve performance by adding separate functions for single buffer
  alloc/free operations

 .../linux-generic/include/odp_buffer_inlines.h     |  26 +-
 .../linux-generic/include/odp_buffer_internal.h    |   5 +-
 platform/linux-generic/include/odp_internal.h      |   2 -
 platform/linux-generic/include/odp_pool_internal.h | 143 +------
 platform/linux-generic/odp_buffer.c                |   3 -
 platform/linux-generic/odp_packet.c                |   5 +-
 platform/linux-generic/odp_pool.c                  | 473 +++++++++++++++++----
 7 files changed, 426 insertions(+), 231 deletions(-)

-- 
2.7.4
diff mbox

Patch

diff --git a/platform/linux-generic/include/odp_buffer_inlines.h b/platform/linux-generic/include/odp_buffer_inlines.h
index 3f4d9fd..2b1ab42 100644
--- a/platform/linux-generic/include/odp_buffer_inlines.h
+++ b/platform/linux-generic/include/odp_buffer_inlines.h
@@ -56,30 +56,12 @@  static inline odp_buffer_hdr_t *odp_buf_to_hdr(odp_buffer_t buf)
 		(pool->pool_mdata_addr + (index * ODP_CACHE_LINE_SIZE));
 }
 
-static inline uint32_t odp_buffer_refcount(odp_buffer_hdr_t *buf)
+static inline uint32_t pool_id_from_buf(odp_buffer_t buf)
 {
-	return odp_atomic_load_u32(&buf->ref_count);
-}
+	odp_buffer_bits_t handle;
 
-static inline uint32_t odp_buffer_incr_refcount(odp_buffer_hdr_t *buf,
-						uint32_t val)
-{
-	return odp_atomic_fetch_add_u32(&buf->ref_count, val) + val;
-}
-
-static inline uint32_t odp_buffer_decr_refcount(odp_buffer_hdr_t *buf,
-						uint32_t val)
-{
-	uint32_t tmp;
-
-	tmp = odp_atomic_fetch_sub_u32(&buf->ref_count, val);
-
-	if (tmp < val) {
-		odp_atomic_fetch_add_u32(&buf->ref_count, val - tmp);
-		return 0;
-	} else {
-		return tmp - val;
-	}
+	handle.handle = buf;
+	return handle.pool_id;
 }
 
 static inline odp_buffer_hdr_t *validate_buf(odp_buffer_t buf)
diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
index f21364c..7b0ef8b 100644
--- a/platform/linux-generic/include/odp_buffer_internal.h
+++ b/platform/linux-generic/include/odp_buffer_internal.h
@@ -114,7 +114,6 @@  struct odp_buffer_hdr_t {
 	union {
 		uint32_t all;
 		struct {
-			uint32_t zeroized:1; /* Zeroize buf data on free */
 			uint32_t hdrdata:1;  /* Data is in buffer hdr */
 			uint32_t sustain:1;  /* Sustain order */
 		};
@@ -123,7 +122,6 @@  struct odp_buffer_hdr_t {
 	int8_t                   type;       /* buffer type */
 	odp_event_type_t         event_type; /* for reuse as event */
 	uint32_t                 size;       /* max data size */
-	odp_atomic_u32_t         ref_count;  /* reference count */
 	odp_pool_t               pool_hdl;   /* buffer pool handle */
 	union {
 		uint64_t         buf_u64;    /* user u64 */
@@ -174,6 +172,9 @@  typedef struct {
 odp_buffer_t buffer_alloc(odp_pool_t pool, size_t size);
 int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
 		       odp_buffer_t buf[], int num);
+void buffer_free(uint32_t pool_id, const odp_buffer_t buf);
+void buffer_free_multi(uint32_t pool_id,
+		       const odp_buffer_t buf[], int num_free);
 int seg_alloc_head(odp_buffer_hdr_t *buf_hdr, int segcount);
 void seg_free_head(odp_buffer_hdr_t *buf_hdr, int segcount);
 int seg_alloc_tail(odp_buffer_hdr_t *buf_hdr, int segcount);
diff --git a/platform/linux-generic/include/odp_internal.h b/platform/linux-generic/include/odp_internal.h
index d12f850..8bad450 100644
--- a/platform/linux-generic/include/odp_internal.h
+++ b/platform/linux-generic/include/odp_internal.h
@@ -119,8 +119,6 @@  int odp_tm_term_global(void);
 int _odp_int_name_tbl_init_global(void);
 int _odp_int_name_tbl_term_global(void);
 
-void _odp_flush_caches(void);
-
 int cpuinfo_parser(FILE *file, system_info_t *sysinfo);
 uint64_t odp_cpu_hz_current(int id);
 
diff --git a/platform/linux-generic/include/odp_pool_internal.h b/platform/linux-generic/include/odp_pool_internal.h
index 3317bd0..d6717ff 100644
--- a/platform/linux-generic/include/odp_pool_internal.h
+++ b/platform/linux-generic/include/odp_pool_internal.h
@@ -51,15 +51,25 @@  typedef struct _odp_buffer_pool_init_t {
 	void *buf_init_arg;        /**< Argument to be passed to buf_init() */
 } _odp_buffer_pool_init_t;         /**< Type of buffer initialization struct */
 
+#define POOL_MAX_LOCAL_CHUNKS 4
+#define POOL_CHUNK_SIZE       32
+#define POOL_MAX_LOCAL_BUFS   (POOL_MAX_LOCAL_CHUNKS * POOL_CHUNK_SIZE)
+
+struct local_cache_s {
+	uint64_t bufallocs;  /* Local buffer alloc count */
+	uint64_t buffrees;   /* Local buffer free count */
+
+	uint32_t num_buf;
+	odp_buffer_hdr_t *buf[POOL_MAX_LOCAL_BUFS];
+};
+
 /* Local cache for buffer alloc/free acceleration */
 typedef struct local_cache_t {
 	union {
-		struct {
-			odp_buffer_hdr_t *buf_freelist;  /* The local cache */
-			uint64_t bufallocs;  /* Local buffer alloc count */
-			uint64_t buffrees;   /* Local buffer free count */
-		};
-		uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(sizeof(uint64_t))];
+		struct local_cache_s s;
+
+		uint8_t pad[ODP_CACHE_LINE_SIZE_ROUNDUP(
+			    sizeof(struct local_cache_s))];
 	};
 } local_cache_t;
 
@@ -214,127 +224,6 @@  static inline void ret_blk(struct pool_entry_s *pool, void *block)
 	odp_atomic_inc_u64(&pool->poolstats.blkfrees);
 }
 
-static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)
-{
-	odp_buffer_hdr_t *myhead;
-	POOL_LOCK(&pool->buf_lock);
-
-	myhead = pool->buf_freelist;
-
-	if (odp_unlikely(myhead == NULL)) {
-		POOL_UNLOCK(&pool->buf_lock);
-		odp_atomic_inc_u64(&pool->poolstats.bufempty);
-	} else {
-		pool->buf_freelist = myhead->next;
-		POOL_UNLOCK(&pool->buf_lock);
-		uint64_t bufcount =
-			odp_atomic_fetch_sub_u32(&pool->bufcount, 1) - 1;
-
-		/* Check for low watermark condition */
-		if (bufcount == pool->buf_low_wm && !pool->buf_low_wm_assert) {
-			pool->buf_low_wm_assert = 1;
-			odp_atomic_inc_u64(&pool->poolstats.buf_low_wm_count);
-		}
-
-		odp_atomic_inc_u64(&pool->poolstats.bufallocs);
-	}
-
-	return (void *)myhead;
-}
-
-static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t *buf)
-{
-	if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {
-		while (buf->segcount > 0) {
-			if (buffer_is_secure(buf) || pool_is_secure(pool))
-				memset(buf->addr[buf->segcount - 1],
-				       0, buf->segsize);
-			ret_blk(pool, buf->addr[--buf->segcount]);
-		}
-		buf->size = 0;
-	}
-
-	buf->allocator = ODP_FREEBUF;  /* Mark buffer free */
-	POOL_LOCK(&pool->buf_lock);
-	buf->next = pool->buf_freelist;
-	pool->buf_freelist = buf;
-	POOL_UNLOCK(&pool->buf_lock);
-
-	uint64_t bufcount = odp_atomic_fetch_add_u32(&pool->bufcount, 1) + 1;
-
-	/* Check if low watermark condition should be deasserted */
-	if (bufcount == pool->buf_high_wm && pool->buf_low_wm_assert) {
-		pool->buf_low_wm_assert = 0;
-		odp_atomic_inc_u64(&pool->poolstats.buf_high_wm_count);
-	}
-
-	odp_atomic_inc_u64(&pool->poolstats.buffrees);
-}
-
-static inline void *get_local_buf(local_cache_t *buf_cache,
-				  struct pool_entry_s *pool,
-				  size_t totsize)
-{
-	odp_buffer_hdr_t *buf = buf_cache->buf_freelist;
-
-	if (odp_likely(buf != NULL)) {
-		buf_cache->buf_freelist = buf->next;
-
-		if (odp_unlikely(buf->size < totsize)) {
-			intmax_t needed = totsize - buf->size;
-
-			do {
-				void *blk = get_blk(pool);
-				if (odp_unlikely(blk == NULL)) {
-					ret_buf(pool, buf);
-					buf_cache->buffrees--;
-					return NULL;
-				}
-				buf->addr[buf->segcount++] = blk;
-				needed -= pool->seg_size;
-			} while (needed > 0);
-
-			buf->size = buf->segcount * pool->seg_size;
-		}
-
-		buf_cache->bufallocs++;
-	}
-
-	return buf;
-}
-
-static inline void ret_local_buf(local_cache_t *buf_cache,
-				odp_buffer_hdr_t *buf)
-{
-	buf->allocator = ODP_FREEBUF;
-	buf->next = buf_cache->buf_freelist;
-	buf_cache->buf_freelist = buf;
-
-	buf_cache->buffrees++;
-}
-
-static inline void flush_cache(local_cache_t *buf_cache,
-			       struct pool_entry_s *pool)
-{
-	odp_buffer_hdr_t *buf = buf_cache->buf_freelist;
-	uint32_t flush_count = 0;
-
-	while (buf != NULL) {
-		odp_buffer_hdr_t *next = buf->next;
-		ret_buf(pool, buf);
-		buf = next;
-		flush_count++;
-	}
-
-	odp_atomic_add_u64(&pool->poolstats.bufallocs, buf_cache->bufallocs);
-	odp_atomic_add_u64(&pool->poolstats.buffrees,
-			   buf_cache->buffrees - flush_count);
-
-	buf_cache->buf_freelist = NULL;
-	buf_cache->bufallocs = 0;
-	buf_cache->buffrees = 0;
-}
-
 static inline odp_pool_t pool_index_to_handle(uint32_t pool_id)
 {
 	return _odp_cast_scalar(odp_pool_t, pool_id);
diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
index e7e4d58..ce2fdba 100644
--- a/platform/linux-generic/odp_buffer.c
+++ b/platform/linux-generic/odp_buffer.c
@@ -67,9 +67,6 @@  int odp_buffer_snprint(char *str, uint32_t n, odp_buffer_t buf)
 	len += snprintf(&str[len], n-len,
 			"  size         %" PRIu32 "\n",        hdr->size);
 	len += snprintf(&str[len], n-len,
-			"  ref_count    %" PRIu32 "\n",
-			odp_atomic_load_u32(&hdr->ref_count));
-	len += snprintf(&str[len], n-len,
 			"  type         %i\n",        hdr->type);
 
 	return len;
diff --git a/platform/linux-generic/odp_packet.c b/platform/linux-generic/odp_packet.c
index 0e319d2..474fa81 100644
--- a/platform/linux-generic/odp_packet.c
+++ b/platform/linux-generic/odp_packet.c
@@ -972,10 +972,7 @@  int _odp_packet_copy_md_to_packet(odp_packet_t srcpkt, odp_packet_t dstpkt)
 		       srchdr->buf_hdr.uarea_size ?
 		       dsthdr->buf_hdr.uarea_size :
 		       srchdr->buf_hdr.uarea_size);
-	odp_atomic_store_u32(
-		&dsthdr->buf_hdr.ref_count,
-		odp_atomic_load_u32(
-			&srchdr->buf_hdr.ref_count));
+
 	copy_packet_parser_metadata(srchdr, dsthdr);
 
 	/* Metadata copied, but return indication of whether the packet
diff --git a/platform/linux-generic/odp_pool.c b/platform/linux-generic/odp_pool.c
index 419f03f..0a427ed 100644
--- a/platform/linux-generic/odp_pool.c
+++ b/platform/linux-generic/odp_pool.c
@@ -57,8 +57,15 @@  static const char SHM_DEFAULT_NAME[] = "odp_buffer_pools";
 /* Pool entry pointers (for inlining) */
 void *pool_entry_ptr[ODP_CONFIG_POOLS];
 
-/* Cache thread id locally for local cache performance */
-static __thread int local_id;
+/* Thread local variables */
+typedef struct pool_local_t {
+	local_cache_t *cache[ODP_CONFIG_POOLS];
+	int thr_id;
+} pool_local_t;
+
+static __thread pool_local_t local;
+
+static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s *pool);
 
 int odp_pool_init_global(void)
 {
@@ -111,7 +118,19 @@  int odp_pool_init_global(void)
 
 int odp_pool_init_local(void)
 {
-	local_id = odp_thread_id();
+	pool_entry_t *pool;
+	int i;
+	int thr_id = odp_thread_id();
+
+	memset(&local, 0, sizeof(pool_local_t));
+
+	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
+		pool           = get_pool_entry(i);
+		local.cache[i] = &pool->s.local_cache[thr_id];
+		local.cache[i]->s.num_buf = 0;
+	}
+
+	local.thr_id = thr_id;
 	return 0;
 }
 
@@ -144,7 +163,14 @@  int odp_pool_term_global(void)
 
 int odp_pool_term_local(void)
 {
-	_odp_flush_caches();
+	int i;
+
+	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
+		pool_entry_t *pool = get_pool_entry(i);
+
+		flush_cache(local.cache[i], &pool->s);
+	}
+
 	return 0;
 }
 
@@ -179,10 +205,53 @@  int odp_pool_capability(odp_pool_capability_t *capa)
 	return 0;
 }
 
-/**
+static inline odp_buffer_hdr_t *get_buf(struct pool_entry_s *pool)
+{
+	odp_buffer_hdr_t *myhead;
+
+	POOL_LOCK(&pool->buf_lock);
+
+	myhead = pool->buf_freelist;
+
+	if (odp_unlikely(myhead == NULL)) {
+		POOL_UNLOCK(&pool->buf_lock);
+		odp_atomic_inc_u64(&pool->poolstats.bufempty);
+	} else {
+		pool->buf_freelist = myhead->next;
+		POOL_UNLOCK(&pool->buf_lock);
+
+		odp_atomic_fetch_sub_u32(&pool->bufcount, 1);
+		odp_atomic_inc_u64(&pool->poolstats.bufallocs);
+	}
+
+	return (void *)myhead;
+}
+
+static inline void ret_buf(struct pool_entry_s *pool, odp_buffer_hdr_t *buf)
+{
+	if (!buf->flags.hdrdata && buf->type != ODP_EVENT_BUFFER) {
+		while (buf->segcount > 0) {
+			if (buffer_is_secure(buf) || pool_is_secure(pool))
+				memset(buf->addr[buf->segcount - 1],
+				       0, buf->segsize);
+			ret_blk(pool, buf->addr[--buf->segcount]);
+		}
+		buf->size = 0;
+	}
+
+	buf->allocator = ODP_FREEBUF;  /* Mark buffer free */
+	POOL_LOCK(&pool->buf_lock);
+	buf->next = pool->buf_freelist;
+	pool->buf_freelist = buf;
+	POOL_UNLOCK(&pool->buf_lock);
+
+	odp_atomic_fetch_add_u32(&pool->bufcount, 1);
+	odp_atomic_inc_u64(&pool->poolstats.buffrees);
+}
+
+/*
  * Pool creation
  */
-
 odp_pool_t _pool_create(const char *name,
 			odp_pool_param_t *params,
 			uint32_t shmflags)
@@ -208,9 +277,6 @@  odp_pool_t _pool_create(const char *name,
 	/* Restriction for v1.0: All non-packet buffers are unsegmented */
 	int unseg = 1;
 
-	/* Restriction for v1.0: No zeroization support */
-	const int zeroized = 0;
-
 	uint32_t blk_size, buf_stride, buf_num, blk_num, seg_len = 0;
 	uint32_t buf_align =
 		params->type == ODP_POOL_BUFFER ? params->buf.align : 0;
@@ -350,7 +416,6 @@  odp_pool_t _pool_create(const char *name,
 		POOL_UNLOCK(&pool->s.lock);
 
 		pool->s.flags.unsegmented = unseg;
-		pool->s.flags.zeroized = zeroized;
 		pool->s.seg_size = unseg ? blk_size : seg_len;
 		pool->s.blk_size = blk_size;
 
@@ -383,9 +448,7 @@  odp_pool_t _pool_create(const char *name,
 			/* Iniitalize buffer metadata */
 			tmp->allocator = ODP_FREEBUF;
 			tmp->flags.all = 0;
-			tmp->flags.zeroized = zeroized;
 			tmp->size = 0;
-			odp_atomic_init_u32(&tmp->ref_count, 0);
 			tmp->type = params->type;
 			tmp->event_type = params->type;
 			tmp->pool_hdl = pool->s.pool_hdl;
@@ -503,6 +566,41 @@  int odp_pool_info(odp_pool_t pool_hdl, odp_pool_info_t *info)
 	return 0;
 }
 
+static inline void get_local_cache_bufs(local_cache_t *buf_cache, uint32_t idx,
+					odp_buffer_hdr_t *buf_hdr[],
+					uint32_t num)
+{
+	uint32_t i;
+
+	for (i = 0; i < num; i++) {
+		buf_hdr[i] = buf_cache->s.buf[idx + i];
+		odp_prefetch(buf_hdr[i]);
+		odp_prefetch_store(buf_hdr[i]);
+	}
+}
+
+static void flush_cache(local_cache_t *buf_cache, struct pool_entry_s *pool)
+{
+	uint32_t flush_count = 0;
+	uint32_t num;
+
+	while ((num = buf_cache->s.num_buf)) {
+		odp_buffer_hdr_t *buf;
+
+		buf = buf_cache->s.buf[num - 1];
+		ret_buf(pool, buf);
+		flush_count++;
+		buf_cache->s.num_buf--;
+	}
+
+	odp_atomic_add_u64(&pool->poolstats.bufallocs, buf_cache->s.bufallocs);
+	odp_atomic_add_u64(&pool->poolstats.buffrees,
+			   buf_cache->s.buffrees - flush_count);
+
+	buf_cache->s.bufallocs = 0;
+	buf_cache->s.buffrees = 0;
+}
+
 int odp_pool_destroy(odp_pool_t pool_hdl)
 {
 	uint32_t pool_id = pool_handle_to_index(pool_hdl);
@@ -621,71 +719,207 @@  void seg_free_tail(odp_buffer_hdr_t *buf_hdr, int segcount)
 	buf_hdr->size      = buf_hdr->segcount * pool->s.seg_size;
 }
 
-odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)
+static inline int get_local_bufs(local_cache_t *buf_cache,
+				 odp_buffer_hdr_t *buf_hdr[], uint32_t max_num)
+{
+	uint32_t num_buf = buf_cache->s.num_buf;
+	uint32_t num = num_buf;
+
+	if (odp_unlikely(num_buf == 0))
+		return 0;
+
+	if (odp_likely(max_num < num))
+		num = max_num;
+
+	get_local_cache_bufs(buf_cache, num_buf - num, buf_hdr, num);
+	buf_cache->s.num_buf   -= num;
+	buf_cache->s.bufallocs += num;
+
+	return num;
+}
+
+static inline void ret_local_buf(local_cache_t *buf_cache, uint32_t idx,
+				 odp_buffer_hdr_t *buf)
+{
+	buf_cache->s.buf[idx] = buf;
+	buf_cache->s.num_buf++;
+	buf_cache->s.buffrees++;
+}
+
+static inline void ret_local_bufs(local_cache_t *buf_cache, uint32_t idx,
+				  odp_buffer_hdr_t *buf[], int num_buf)
+{
+	int i;
+
+	for (i = 0; i < num_buf; i++)
+		buf_cache->s.buf[idx + i] = buf[i];
+
+	buf_cache->s.num_buf  += num_buf;
+	buf_cache->s.buffrees += num_buf;
+}
+
+int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
+		       odp_buffer_t buf[], int max_num)
 {
 	uint32_t pool_id = pool_handle_to_index(pool_hdl);
 	pool_entry_t *pool = get_pool_entry(pool_id);
 	uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;
-	odp_anybuf_t *buf;
+	odp_buffer_hdr_t *buf_tbl[max_num];
+	odp_buffer_hdr_t *buf_hdr;
+	int num, i;
+	intmax_t needed;
+	void *blk;
 
 	/* Reject oversized allocation requests */
 	if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||
 	    (!pool->s.flags.unsegmented &&
 	     totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))
-		return ODP_BUFFER_INVALID;
+		return 0;
 
 	/* Try to satisfy request from the local cache */
-	buf = (odp_anybuf_t *)
-		(void *)get_local_buf(&pool->s.local_cache[local_id],
-				      &pool->s, totsize);
+	num = get_local_bufs(local.cache[pool_id], buf_tbl, max_num);
 
 	/* If cache is empty, satisfy request from the pool */
-	if (odp_unlikely(buf == NULL)) {
-		buf = (odp_anybuf_t *)(void *)get_buf(&pool->s);
+	if (odp_unlikely(num < max_num)) {
+		for (; num < max_num; num++) {
+			buf_hdr = get_buf(&pool->s);
 
-		if (odp_unlikely(buf == NULL))
+			if (odp_unlikely(buf_hdr == NULL))
+				goto pool_empty;
+
+			/* Get blocks for this buffer, if pool uses
+			 * application data */
+			if (buf_hdr->size < totsize) {
+				uint32_t segcount;
+
+				needed = totsize - buf_hdr->size;
+				do {
+					blk = get_blk(&pool->s);
+					if (odp_unlikely(blk == NULL)) {
+						ret_buf(&pool->s, buf_hdr);
+						goto pool_empty;
+					}
+
+					segcount = buf_hdr->segcount++;
+					buf_hdr->addr[segcount] = blk;
+					needed -= pool->s.seg_size;
+				} while (needed > 0);
+				buf_hdr->size = buf_hdr->segcount *
+						pool->s.seg_size;
+			}
+
+			buf_tbl[num] = buf_hdr;
+		}
+	}
+
+pool_empty:
+	for (i = 0; i < num; i++) {
+		buf_hdr = buf_tbl[i];
+
+		/* Mark buffer as allocated */
+		buf_hdr->allocator = local.thr_id;
+
+		/* By default, buffers are not associated with
+		 * an ordered queue */
+		buf_hdr->origin_qe = NULL;
+
+		buf[i] = odp_hdr_to_buf(buf_hdr);
+
+		/* Add more segments if buffer from local cache is too small */
+		if (odp_unlikely(buf_hdr->size < totsize)) {
+			needed = totsize - buf_hdr->size;
+			do {
+				blk = get_blk(&pool->s);
+				if (odp_unlikely(blk == NULL)) {
+					int j;
+
+					ret_buf(&pool->s, buf_hdr);
+					buf_hdr = NULL;
+					local.cache[pool_id]->s.buffrees--;
+
+					/* move remaining bufs up one step
+					 * and update loop counters */
+					num--;
+					for (j = i; j < num; j++)
+						buf_tbl[j] = buf_tbl[j + 1];
+
+					i--;
+					break;
+				}
+				needed -= pool->s.seg_size;
+				buf_hdr->addr[buf_hdr->segcount++] = blk;
+				buf_hdr->size = buf_hdr->segcount *
+						pool->s.seg_size;
+			} while (needed > 0);
+		}
+	}
+
+	return num;
+}
+
+odp_buffer_t buffer_alloc(odp_pool_t pool_hdl, size_t size)
+{
+	uint32_t pool_id = pool_handle_to_index(pool_hdl);
+	pool_entry_t *pool = get_pool_entry(pool_id);
+	uintmax_t totsize = pool->s.headroom + size + pool->s.tailroom;
+	odp_buffer_hdr_t *buf_hdr;
+	intmax_t needed;
+	void *blk;
+
+	/* Reject oversized allocation requests */
+	if ((pool->s.flags.unsegmented && totsize > pool->s.seg_size) ||
+	    (!pool->s.flags.unsegmented &&
+	     totsize > pool->s.seg_size * ODP_BUFFER_MAX_SEG))
+		return 0;
+
+	/* Try to satisfy request from the local cache. If cache is empty,
+	 * satisfy request from the pool */
+	if (odp_unlikely(!get_local_bufs(local.cache[pool_id], &buf_hdr, 1))) {
+		buf_hdr = get_buf(&pool->s);
+
+		if (odp_unlikely(buf_hdr == NULL))
 			return ODP_BUFFER_INVALID;
 
 		/* Get blocks for this buffer, if pool uses application data */
-		if (buf->buf.size < totsize) {
-			intmax_t needed = totsize - buf->buf.size;
+		if (buf_hdr->size < totsize) {
+			needed = totsize - buf_hdr->size;
 			do {
-				uint8_t *blk = get_blk(&pool->s);
-				if (blk == NULL) {
-					ret_buf(&pool->s, &buf->buf);
+				blk = get_blk(&pool->s);
+				if (odp_unlikely(blk == NULL)) {
+					ret_buf(&pool->s, buf_hdr);
 					return ODP_BUFFER_INVALID;
 				}
-				buf->buf.addr[buf->buf.segcount++] = blk;
+				buf_hdr->addr[buf_hdr->segcount++] = blk;
 				needed -= pool->s.seg_size;
 			} while (needed > 0);
-			buf->buf.size = buf->buf.segcount * pool->s.seg_size;
+			buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;
 		}
 	}
-
 	/* Mark buffer as allocated */
-	buf->buf.allocator = local_id;
+	buf_hdr->allocator = local.thr_id;
 
-	/* By default, buffers inherit their pool's zeroization setting */
-	buf->buf.flags.zeroized = pool->s.flags.zeroized;
+	/* By default, buffers are not associated with
+	 * an ordered queue */
+	buf_hdr->origin_qe = NULL;
 
-	/* By default, buffers are not associated with an ordered queue */
-	buf->buf.origin_qe = NULL;
-
-	return odp_hdr_to_buf(&buf->buf);
-}
-
-int buffer_alloc_multi(odp_pool_t pool_hdl, size_t size,
-		       odp_buffer_t buf[], int num)
-{
-	int count;
-
-	for (count = 0; count < num; ++count) {
-		buf[count] = buffer_alloc(pool_hdl, size);
-		if (buf[count] == ODP_BUFFER_INVALID)
-			break;
+	/* Add more segments if buffer from local cache is too small */
+	if (odp_unlikely(buf_hdr->size < totsize)) {
+		needed = totsize - buf_hdr->size;
+		do {
+			blk = get_blk(&pool->s);
+			if (odp_unlikely(blk == NULL)) {
+				ret_buf(&pool->s, buf_hdr);
+				buf_hdr = NULL;
+				local.cache[pool_id]->s.buffrees--;
+				return ODP_BUFFER_INVALID;
+			}
+			buf_hdr->addr[buf_hdr->segcount++] = blk;
+			needed -= pool->s.seg_size;
+		} while (needed > 0);
+		buf_hdr->size = buf_hdr->segcount * pool->s.seg_size;
 	}
 
-	return count;
+	return odp_hdr_to_buf(buf_hdr);
 }
 
 odp_buffer_t odp_buffer_alloc(odp_pool_t pool_hdl)
@@ -701,35 +935,132 @@  int odp_buffer_alloc_multi(odp_pool_t pool_hdl, odp_buffer_t buf[], int num)
 	return buffer_alloc_multi(pool_hdl, buf_size, buf, num);
 }
 
-void odp_buffer_free(odp_buffer_t buf)
+static void multi_pool_free(odp_buffer_hdr_t *buf_hdr[], int num_buf)
 {
-	odp_buffer_hdr_t *buf_hdr = odp_buf_to_hdr(buf);
-	pool_entry_t *pool = odp_buf_to_pool(buf_hdr);
+	uint32_t pool_id, num;
+	local_cache_t *buf_cache;
+	pool_entry_t *pool;
+	int i, j, idx;
 
+	for (i = 0; i < num_buf; i++) {
+		pool_id   =  pool_handle_to_index(buf_hdr[i]->pool_hdl);
+		buf_cache = local.cache[pool_id];
+		num       = buf_cache->s.num_buf;
+
+		if (num < POOL_MAX_LOCAL_BUFS) {
+			ret_local_buf(buf_cache, num, buf_hdr[i]);
+			continue;
+		}
+
+		idx  = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;
+		pool = get_pool_entry(pool_id);
+
+		/* local cache full, return a chunk */
+		for (j = 0; j < POOL_CHUNK_SIZE; j++) {
+			odp_buffer_hdr_t *tmp;
+
+			tmp = buf_cache->s.buf[idx + j];
+			ret_buf(&pool->s, tmp);
+		}
+
+		num = POOL_MAX_LOCAL_BUFS - POOL_CHUNK_SIZE;
+		buf_cache->s.num_buf = num;
+		ret_local_buf(buf_cache, num, buf_hdr[i]);
+	}
+}
+
+void buffer_free_multi(uint32_t pool_id,
+		       const odp_buffer_t buf[], int num_free)
+{
+	local_cache_t *buf_cache = local.cache[pool_id];
+	uint32_t num;
+	int i, idx;
+	pool_entry_t *pool;
+	odp_buffer_hdr_t *buf_hdr[num_free];
+	int multi_pool = 0;
+
+	for (i = 0; i < num_free; i++) {
+		uint32_t id;
+
+		buf_hdr[i] = odp_buf_to_hdr(buf[i]);
+		ODP_ASSERT(buf_hdr[i]->allocator != ODP_FREEBUF);
+		buf_hdr[i]->allocator = ODP_FREEBUF;
+		id = pool_handle_to_index(buf_hdr[i]->pool_hdl);
+		multi_pool |= (pool_id != id);
+	}
+
+	if (odp_unlikely(multi_pool)) {
+		multi_pool_free(buf_hdr, num_free);
+		return;
+	}
+
+	num = buf_cache->s.num_buf;
+
+	if (odp_likely((num + num_free) < POOL_MAX_LOCAL_BUFS)) {
+		ret_local_bufs(buf_cache, num, buf_hdr, num_free);
+		return;
+	}
+
+	pool = get_pool_entry(pool_id);
+
+	/* Return at least one chunk into the global pool */
+	if (odp_unlikely(num_free > POOL_CHUNK_SIZE)) {
+		for (i = 0; i < num_free; i++)
+			ret_buf(&pool->s, buf_hdr[i]);
+
+		return;
+	}
+
+	idx = num - POOL_CHUNK_SIZE;
+	for (i = 0; i < POOL_CHUNK_SIZE; i++)
+		ret_buf(&pool->s, buf_cache->s.buf[idx + i]);
+
+	num -= POOL_CHUNK_SIZE;
+	buf_cache->s.num_buf = num;
+	ret_local_bufs(buf_cache, num, buf_hdr, num_free);
+}
+
+void buffer_free(uint32_t pool_id, const odp_buffer_t buf)
+{
+	local_cache_t *buf_cache = local.cache[pool_id];
+	uint32_t num;
+	int i;
+	pool_entry_t *pool;
+	odp_buffer_hdr_t *buf_hdr;
+
+	buf_hdr = odp_buf_to_hdr(buf);
 	ODP_ASSERT(buf_hdr->allocator != ODP_FREEBUF);
+	buf_hdr->allocator = ODP_FREEBUF;
 
-	if (odp_unlikely(pool->s.buf_low_wm_assert || pool->s.blk_low_wm_assert))
-		ret_buf(&pool->s, buf_hdr);
-	else
-		ret_local_buf(&pool->s.local_cache[local_id], buf_hdr);
+	num = buf_cache->s.num_buf;
+
+	if (odp_likely((num + 1) < POOL_MAX_LOCAL_BUFS)) {
+		ret_local_bufs(buf_cache, num, &buf_hdr, 1);
+		return;
+	}
+
+	pool = get_pool_entry(pool_id);
+
+	num -= POOL_CHUNK_SIZE;
+	for (i = 0; i < POOL_CHUNK_SIZE; i++)
+		ret_buf(&pool->s, buf_cache->s.buf[num + i]);
+
+	buf_cache->s.num_buf = num;
+	ret_local_bufs(buf_cache, num, &buf_hdr, 1);
+}
+
+void odp_buffer_free(odp_buffer_t buf)
+{
+	uint32_t pool_id = pool_id_from_buf(buf);
+
+	buffer_free(pool_id, buf);
 }
 
 void odp_buffer_free_multi(const odp_buffer_t buf[], int num)
 {
-	int i;
+	uint32_t pool_id = num > 0 ? pool_id_from_buf(buf[0]) : 0;
 
-	for (i = 0; i < num; ++i)
-		odp_buffer_free(buf[i]);
-}
-
-void _odp_flush_caches(void)
-{
-	int i;
-
-	for (i = 0; i < ODP_CONFIG_POOLS; i++) {
-		pool_entry_t *pool = get_pool_entry(i);
-		flush_cache(&pool->s.local_cache[local_id], &pool->s);
-	}
+	buffer_free_multi(pool_id, buf, num);
 }
 
 void odp_pool_print(odp_pool_t pool_hdl)
@@ -774,7 +1105,6 @@  void odp_pool_print(odp_pool_t pool_hdl)
 		pool->s.quiesced ? "quiesced" : "active");
 	ODP_DBG(" pool opts       %s, %s, %s\n",
 		pool->s.flags.unsegmented ? "unsegmented" : "segmented",
-		pool->s.flags.zeroized ? "zeroized" : "non-zeroized",
 		pool->s.flags.predefined  ? "predefined" : "created");
 	ODP_DBG(" pool base       %p\n",  pool->s.pool_base_addr);
 	ODP_DBG(" pool size       %zu (%zu pages)\n",
@@ -817,10 +1147,11 @@  void odp_pool_print(odp_pool_t pool_hdl)
 	ODP_DBG(" blk low wm count    %lu\n", blklowmct);
 }
 
-
 odp_pool_t odp_buffer_pool(odp_buffer_t buf)
 {
-	return odp_buf_to_hdr(buf)->pool_hdl;
+	uint32_t pool_id = pool_id_from_buf(buf);
+
+	return pool_index_to_handle(pool_id);
 }
 
 void odp_pool_param_init(odp_pool_param_t *params)