Message ID | 20250206072102.29045-17-kanchana.p.sridhar@intel.com |
---|---|
State | New |
Series | zswap IAA compress batching |
On Wed, Feb 5, 2025 at 11:21 PM Kanchana P Sridhar <kanchana.p.sridhar@intel.com> wrote:
>
> With the previous patch that enables support for batch compressions in
> zswap_compress_folio(), a 6.2% throughput regression was seen with zstd and
> 2M folios, using vm-scalability/usemem.
>
> For compressors that don't support batching, this was root-caused to the
> following zswap_store_folio() structure:
>
> Batched stores:
> ---------------
> - Allocate all entries,
> - Compress all entries,
> - Store all entries in xarray/LRU.

Can you clarify why the above structure leads to performance regression?
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Thursday, February 20, 2025 3:29 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org; hannes@cmpxchg.org;
> yosry.ahmed@linux.dev; chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; 21cnbao@gmail.com; akpm@linux-foundation.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Feghali, Wajdi K <wajdi.k.feghali@intel.com>;
> Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v6 16/16] mm: zswap: Fix for zstd performance
> regression with 2M folios.
>
> On Wed, Feb 5, 2025 at 11:21 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > With the previous patch that enables support for batch compressions in
> > zswap_compress_folio(), a 6.2% throughput regression was seen with zstd
> > and 2M folios, using vm-scalability/usemem.
> >
> > For compressors that don't support batching, this was root-caused to the
> > following zswap_store_folio() structure:
> >
> > Batched stores:
> > ---------------
> > - Allocate all entries,
> > - Compress all entries,
> > - Store all entries in xarray/LRU.
>
> Can you clarify why the above structure leads to performance regression?

Good question. My best hypothesis is that it has to do with a combination of
branch mis-predicts and the working set size in the zswap_store_folio() code
blocks having to load and iterate over 512 pages in the 3 loops. There could
also be caching issues involved, as Yosry pointed out, since there are more
LLC-store-misses in patch 15 (that has the regression) as compared to patch
16 (that fixes the regression).

The v6 cover letter has a section "zstd 2M regression fix details" in which I
have posted patch 15 and patch 16 perf event counters that seem to back up
this hypothesis.

All I can say for certain is that, for compressors that don't support
batching (e.g. zstd), this code flow resolves the regression (I have verified
this through 10 runs in each case, to rule out variance):

Sequential stores:
------------------
For each page in folio:
 - allocate an entry,
 - compress the page,
 - store the entry in xarray/LRU.

This explains the implementation in patch 16. In any case, please do let me
know if you have other insights based on the perf event counts.

Also, I have been incorporating the suggestions from Yosry (some of which
have to do with processing large folios in ZSWAP_MAX_BATCH_SIZE increments to
see if that resolves the regression), and from Herbert with regards to
request chaining. I hope to share the results soon.

Thanks,
Kanchana
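To make the two flows in this discussion concrete, here is a minimal
userspace C sketch of the contrast. alloc_entry(), compress_page() and
store_entry() are hypothetical stand-ins (the real helpers are
zswap_entry_cache_alloc(), zswap_compress()/zswap_batch_compress(), and the
xa_store()/LRU publishing code), and the stubbed bodies exist only so the
sketch compiles. The point is the loop shape: the batched structure walks all
512 entries of a 2M folio three times, while the sequential structure touches
each page's entry once while it is still cache-hot.

    #include <stdbool.h>

    /* Hypothetical stand-ins, stubbed so this sketch compiles standalone. */
    static bool alloc_entry(long i)   { (void)i; return true; }
    static bool compress_page(long i) { (void)i; return true; }
    static void store_entry(long i)   { (void)i; }

    /* Batched stores: three separate passes over all nr_pages entries. */
    static bool store_batched(long nr_pages)
    {
            long i;

            for (i = 0; i < nr_pages; i++)  /* pass 1: allocate entries */
                    if (!alloc_entry(i))
                            return false;
            for (i = 0; i < nr_pages; i++)  /* pass 2: compress pages */
                    if (!compress_page(i))
                            return false;
            for (i = 0; i < nr_pages; i++)  /* pass 3: store in xarray/LRU */
                    store_entry(i);
            return true;
    }

    /*
     * Sequential stores: one fused pass; each entry is allocated, compressed
     * and stored before the next page is touched.
     */
    static bool store_sequential(long nr_pages)
    {
            long i;

            for (i = 0; i < nr_pages; i++) {
                    if (!alloc_entry(i) || !compress_page(i))
                            return false;
                    store_entry(i);
            }
            return true;
    }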
Hi Yosry, Nhat,

> -----Original Message-----
> From: Yosry Ahmed <yosry.ahmed@linux.dev>
> Sent: Thursday, February 6, 2025 11:15 AM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org; hannes@cmpxchg.org;
> nphamcs@gmail.com; chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; 21cnbao@gmail.com; akpm@linux-foundation.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Feghali, Wajdi K <wajdi.k.feghali@intel.com>;
> Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v6 16/16] mm: zswap: Fix for zstd performance
> regression with 2M folios.
>
> On Wed, Feb 05, 2025 at 11:21:02PM -0800, Kanchana P Sridhar wrote:
> > With the previous patch that enables support for batch compressions in
> > zswap_compress_folio(), a 6.2% throughput regression was seen with zstd
> > and 2M folios, using vm-scalability/usemem.
> >
> > For compressors that don't support batching, this was root-caused to the
> > following zswap_store_folio() structure:
> >
> > Batched stores:
> > ---------------
> > - Allocate all entries,
> > - Compress all entries,
> > - Store all entries in xarray/LRU.
>
> Could it be the cache locality?

It is possible, since there are more LLC-store-misses in v6 patch 15 (that
has the regression) as compared to v6 patch 16 (that fixes the regression).

> I wonder if we should do what Chengming initially suggested and batch
> everything at ZSWAP_MAX_BATCH_SIZE instead. Instead of
> zswap_compress_folio() operating on the entire folio, we can operate on
> batches of size ZSWAP_MAX_BATCH_SIZE, regardless of whether the
> underlying compressor supports batching.
>
> If we do this, instead of:
> - Allocate all entries
> - Compress all entries
> - Store all entries
>
> We can do:
> - For each batch (8 entries)
>   - Allocate all entries
>   - Compress all entries
>   - Store all entries
>
> This should help unify the code, and I suspect it may also fix the zstd
> regression. We can also skip the entries array allocation and use one on
> the stack.

I tried the above, and additional approaches to unify the code, and all of
them cause a regression in zstd throughput and latency in the usemem 30
processes experiments:

1) Operate on batches of size ZSWAP_MAX_BATCH_SIZE, regardless of whether
   the underlying compressor supports batching. I tried this without and
   with upfront allocation of all entries.

2) Unify the post-compression zpool malloc, handle mapping, and memcpy
   computes done in zswap_compress() and zswap_batch_compress() by moving
   them to a procedure and calling this from both these functions. Tried
   without and with inlining this procedure.

3) With the v7 changes to make the acomp_ctx resources' lifetime be from
   pool creation to pool deletion, I wanted to make another functional
   change in acomp_ctx_get_cpu_lock(), as described in the code comments in
   v7. However, I found that making just this change in zswap_decompress()
   caused a zstd regression:

	do {
		acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
	} while (!acomp_ctx);

	/*
	 * If the CPU onlining code successfully allocates acomp_ctx
	 * resources, it sets acomp_ctx->initialized to true. Until this
	 * happens, we have two options:
	 *
	 * 1. Return NULL and fail all stores on this CPU.
	 * 2. Retry, until onlining has finished allocating resources.
	 *
	 * In theory, option 1 could be more appropriate, because it
	 * allows the calling procedure to decide how it wants to handle
	 * reclaim racing with CPU hotplug. For instance, it might be Ok
	 * for compress to return an error for the backing swap device
	 * to store the folio. Decompress could wait until we get a
	 * valid and locked mutex after onlining has completed. For now,
	 * we go with option 2 because adding a do-while in
	 * zswap_decompress() adds latency for software compressors.
	 */

My main learning from these attempts to try alternate approaches to resolve
the zstd regression, and to unify common computes into distinct procedures
that zswap_compress() and zswap_batch_compress() will each call, is this:
zstd is extremely sensitive to any and all compute overheads.

The above approaches yield approximately these zstd throughput and latency
numbers with 64K folios quite consistently (I will summarize the mm-unstable
data in v7):

  Total throughput (KB/s)     6,909,838
  Average throughput (KB/s)     230,327
  elapsed time (sec)              94.58
  sys time (sec)               2,358.36

The v7 implementation of zswap_store_folio() consistently gives better
latency:

  Total throughput (KB/s)     6,939,253
  Average throughput (KB/s)     231,308
  elapsed time (sec)              88.64
  sys time (sec)               2,197.54

My main goal in v7 has been to not regress zstd by introducing compute
overheads, while deriving the benefits of batching with request chaining,
and trying to preserve the v6 structure of zswap_store_folio() that aims to
strike a balance between:

1) Common code for batching and sequential.

2) Making that common code "behave" sequentially so that zstd does not see
   a latency regression.

I would appreciate it if you can review the v7 implementation and let me
know if this would be acceptable.

Thanks,
Kanchana
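For reference, here is a sketch of the fixed-increment structure discussed
above: processing the folio in ZSWAP_MAX_BATCH_SIZE chunks with an on-stack
entries array. It is written as standalone userspace C with hypothetical stub
helpers (the value 8 comes from Yosry's "(8 entries)" above); this is one of
the variants reported as tried, not code from the series.

    #include <stdbool.h>

    #define ZSWAP_MAX_BATCH_SIZE 8

    /* Hypothetical stand-ins for entry allocation, (possibly batched)
     * compression, and xarray/LRU publishing, stubbed so this compiles. */
    static bool alloc_entries(void *entries[], unsigned int n)
    { (void)entries; (void)n; return true; }
    static bool compress_batch(void *entries[], long from, unsigned int n)
    { (void)entries; (void)from; (void)n; return true; }
    static void store_entries(void *entries[], long from, unsigned int n)
    { (void)entries; (void)from; (void)n; }

    /*
     * Process a folio in fixed ZSWAP_MAX_BATCH_SIZE increments, regardless
     * of whether the underlying compressor batches internally. The entries
     * array lives on the stack, so no kmalloc() is needed per store.
     */
    static bool store_folio_in_batches(long nr_pages)
    {
            void *entries[ZSWAP_MAX_BATCH_SIZE];
            long from;

            for (from = 0; from < nr_pages; from += ZSWAP_MAX_BATCH_SIZE) {
                    unsigned int n = ZSWAP_MAX_BATCH_SIZE;

                    if (nr_pages - from < n)
                            n = (unsigned int)(nr_pages - from);

                    if (!alloc_entries(entries, n) ||
                        !compress_batch(entries, from, n))
                            return false;
                    store_entries(entries, from, n);
            }
            return true;
    }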
diff --git a/mm/zswap.c b/mm/zswap.c
index f1cba77eda62..7bfc720a6201 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1592,40 +1592,32 @@ static bool zswap_batch_compress(struct folio *folio,
 	return ret;
 }
 
-static bool zswap_compress_folio(struct folio *folio,
-				 struct zswap_entry *entries[],
-				 struct zswap_pool *pool)
+static __always_inline bool zswap_compress_folio(struct folio *folio,
+						 struct zswap_entry *entries[],
+						 long from_index,
+						 struct zswap_pool *pool,
+						 unsigned int batch_size,
+						 struct crypto_acomp_ctx *acomp_ctx)
 {
-	long index, nr_pages = folio_nr_pages(folio);
-	struct crypto_acomp_ctx *acomp_ctx;
-	unsigned int batch_size;
+	long index = from_index, nr_pages = folio_nr_pages(folio);
 	bool ret = true;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
-	batch_size = acomp_ctx->nr_reqs;
-
 	if ((batch_size > 1) && (nr_pages > 1)) {
-		for (index = 0; index < nr_pages; index += batch_size) {
+		for (; index < nr_pages; index += batch_size) {
 
 			if (!zswap_batch_compress(folio, index, batch_size,
 						  &entries[index], pool, acomp_ctx)) {
 				ret = false;
-				goto unlock_acomp_ctx;
+				break;
 			}
 		}
 	} else {
-		for (index = 0; index < nr_pages; ++index) {
-			struct page *page = folio_page(folio, index);
+		struct page *page = folio_page(folio, index);
 
-			if (!zswap_compress(page, entries[index], pool, acomp_ctx)) {
-				ret = false;
-				goto unlock_acomp_ctx;
-			}
-		}
+		if (!zswap_compress(page, entries[index], pool, acomp_ctx))
+			ret = false;
 	}
 
-unlock_acomp_ctx:
-	acomp_ctx_put_unlock(acomp_ctx);
 	return ret;
 }
 
@@ -1637,92 +1629,128 @@ static bool zswap_compress_folio(struct folio *folio,
  * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
  * entry's handle is subsequently modified only upon a successful zpool_malloc()
  * after the page is compressed.
+ *
+ * For compressors that don't support batching, the following structure
+ * showed a performance regression with zstd/2M folios:
+ *
+ * Batched stores:
+ * ---------------
+ * - Allocate all entries,
+ * - Compress all entries,
+ * - Store all entries in xarray/LRU.
+ *
+ * Hence, the above structure is maintained only for batched stores, and the
+ * following structure is implemented for sequential stores of large folio pages,
+ * that fixes the regression, while preserving common code paths for batched
+ * and sequential stores of a folio:
+ *
+ * Sequential stores:
+ * ------------------
+ * For each page in folio:
+ *  - allocate an entry,
+ *  - compress the page,
+ *  - store the entry in xarray/LRU.
 */
 static bool zswap_store_folio(struct folio *folio,
 			      struct obj_cgroup *objcg,
 			      struct zswap_pool *pool)
 {
-	long index, from_index = 0, nr_pages = folio_nr_pages(folio);
+	long index = 0, from_index = 0, nr_pages, nr_folio_pages = folio_nr_pages(folio);
 	struct zswap_entry **entries = NULL;
+	struct crypto_acomp_ctx *acomp_ctx;
 	int node_id = folio_nid(folio);
+	unsigned int batch_size;
 
-	entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);
+	entries = kmalloc(nr_folio_pages * sizeof(*entries), GFP_KERNEL);
 	if (!entries)
 		return false;
 
-	for (index = 0; index < nr_pages; ++index) {
-		entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+	batch_size = acomp_ctx->nr_reqs;
 
-		if (!entries[index]) {
-			zswap_reject_kmemcache_fail++;
-			nr_pages = index;
-			goto store_folio_failed;
+	nr_pages = (batch_size > 1) ? nr_folio_pages : 1;
+
+	while (1) {
+		for (index = from_index; index < nr_pages; ++index) {
+			entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+
+			if (!entries[index]) {
+				zswap_reject_kmemcache_fail++;
+				nr_pages = index;
+				goto store_folio_failed;
+			}
+
+			entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
 		}
 
-		entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
-	}
+		if (!zswap_compress_folio(folio, entries, from_index, pool, batch_size, acomp_ctx))
+			goto store_folio_failed;
 
-	if (!zswap_compress_folio(folio, entries, pool))
-		goto store_folio_failed;
+		for (index = from_index; index < nr_pages; ++index) {
+			swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
+			struct zswap_entry *old, *entry = entries[index];
 
-	for (index = 0; index < nr_pages; ++index) {
-		swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
-		struct zswap_entry *old, *entry = entries[index];
+			old = xa_store(swap_zswap_tree(page_swpentry),
+				       swp_offset(page_swpentry),
+				       entry, GFP_KERNEL);
+			if (xa_is_err(old)) {
+				int err = xa_err(old);
 
-		old = xa_store(swap_zswap_tree(page_swpentry),
-			       swp_offset(page_swpentry),
-			       entry, GFP_KERNEL);
-		if (xa_is_err(old)) {
-			int err = xa_err(old);
+				WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+				zswap_reject_alloc_fail++;
+				from_index = index;
+				goto store_folio_failed;
+			}
 
-			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-			zswap_reject_alloc_fail++;
-			from_index = index;
-			goto store_folio_failed;
-		}
+			/*
+			 * We may have had an existing entry that became stale when
+			 * the folio was redirtied and now the new version is being
+			 * swapped out. Get rid of the old.
+			 */
+			if (old)
+				zswap_entry_free(old);
 
-		/*
-		 * We may have had an existing entry that became stale when
-		 * the folio was redirtied and now the new version is being
-		 * swapped out. Get rid of the old.
-		 */
-		if (old)
-			zswap_entry_free(old);
+			/*
			 * The entry is successfully compressed and stored in the tree, there is
+			 * no further possibility of failure. Grab refs to the pool and objcg,
+			 * charge zswap memory, and increment zswap_stored_pages.
+			 * The opposite actions will be performed by zswap_entry_free()
+			 * when the entry is removed from the tree.
+			 */
+			zswap_pool_get(pool);
+			if (objcg) {
+				obj_cgroup_get(objcg);
+				obj_cgroup_charge_zswap(objcg, entry->length);
+			}
+			atomic_long_inc(&zswap_stored_pages);
 
-		/*
-		 * The entry is successfully compressed and stored in the tree, there is
-		 * no further possibility of failure. Grab refs to the pool and objcg,
-		 * charge zswap memory, and increment zswap_stored_pages.
-		 * The opposite actions will be performed by zswap_entry_free()
-		 * when the entry is removed from the tree.
-		 */
-		zswap_pool_get(pool);
-		if (objcg) {
-			obj_cgroup_get(objcg);
-			obj_cgroup_charge_zswap(objcg, entry->length);
+			/*
+			 * We finish initializing the entry while it's already in xarray.
+			 * This is safe because:
+			 *
+			 * 1. Concurrent stores and invalidations are excluded by folio lock.
+			 *
+			 * 2. Writeback is excluded by the entry not being on the LRU yet.
+			 *    The publishing order matters to prevent writeback from seeing
+			 *    an incoherent entry.
+			 */
+			entry->pool = pool;
+			entry->swpentry = page_swpentry;
+			entry->objcg = objcg;
+			entry->referenced = true;
+			if (entry->length) {
+				INIT_LIST_HEAD(&entry->lru);
+				zswap_lru_add(&zswap_list_lru, entry);
+			}
 		}
-		atomic_long_inc(&zswap_stored_pages);
 
-		/*
-		 * We finish initializing the entry while it's already in xarray.
-		 * This is safe because:
-		 *
-		 * 1. Concurrent stores and invalidations are excluded by folio lock.
-		 *
-		 * 2. Writeback is excluded by the entry not being on the LRU yet.
-		 *    The publishing order matters to prevent writeback from seeing
-		 *    an incoherent entry.
-		 */
-		entry->pool = pool;
-		entry->swpentry = page_swpentry;
-		entry->objcg = objcg;
-		entry->referenced = true;
-		if (entry->length) {
-			INIT_LIST_HEAD(&entry->lru);
-			zswap_lru_add(&zswap_list_lru, entry);
-		}
+		from_index = nr_pages++;
+
+		if (nr_pages > nr_folio_pages)
+			break;
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return true;
 
@@ -1734,6 +1762,7 @@ static bool zswap_store_folio(struct folio *folio,
 		zswap_entry_cache_free(entries[index]);
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return false;
 }
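One subtle point in the patched zswap_store_folio() above is the window
arithmetic: with batch_size == 1, nr_pages starts at 1 and
"from_index = nr_pages++" advances the window by one page per while(1)
iteration, so the allocate/compress/store passes each cover exactly one page
(the sequential flow); with batch_size > 1, nr_pages starts at
nr_folio_pages and the loop body runs once over the whole folio. A small
standalone C trace of just that arithmetic (an illustration, not kernel
code):

    #include <stdio.h>

    /* Trace the [from_index, nr_pages) window of the while(1) loop in the
     * patched zswap_store_folio(), for a given batch_size and folio size. */
    static void trace_windows(unsigned int batch_size, long nr_folio_pages)
    {
            long from_index = 0;
            long nr_pages = (batch_size > 1) ? nr_folio_pages : 1;

            printf("batch_size=%u, folio pages=%ld\n",
                   batch_size, nr_folio_pages);
            while (1) {
                    printf("  alloc/compress/store pages [%ld..%ld)\n",
                           from_index, nr_pages);
                    from_index = nr_pages++;
                    if (nr_pages > nr_folio_pages)
                            break;
            }
    }

    int main(void)
    {
            trace_windows(1, 4);  /* sequential: four windows of one page */
            trace_windows(8, 4);  /* batched: one window covers the folio */
            return 0;
    }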
With the previous patch that enables support for batch compressions in
zswap_compress_folio(), a 6.2% throughput regression was seen with zstd and
2M folios, using vm-scalability/usemem.

For compressors that don't support batching, this was root-caused to the
following zswap_store_folio() structure:

Batched stores:
---------------
- Allocate all entries,
- Compress all entries,
- Store all entries in xarray/LRU.

Hence, the above structure is maintained only for batched stores, and the
following structure is implemented for sequential stores of large folio
pages, that fixes the zstd regression, while preserving common code paths
for batched and sequential stores of a folio:

Sequential stores:
------------------
For each page in folio:
 - allocate an entry,
 - compress the page,
 - store the entry in xarray/LRU.

This is submitted as a separate patch only for code review purposes. I will
squash this with the previous commit in subsequent versions of this
patch-series.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 mm/zswap.c | 193 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 111 insertions(+), 82 deletions(-)