
crypto: x86: Do not acquire fpu context for too long

Message ID 20221004044912.24770-1-ap420073@gmail.com
State New
Headers show
Series crypto: x86: Do not acquire fpu context for too long | expand

Commit Message

Taehee Yoo Oct. 4, 2022, 4:49 a.m. UTC
Many drivers such as camellia, cast5, etc. use the ECB helper macros.
These macros call kernel_fpu_begin(), then the encrypt() or decrypt()
routine, and finally kernel_fpu_end(). If too many blocks are passed
in, the FPU context is held for too long.

kernel_fpu_{begin | end}() internally calls preempt_{disable | enable}(),
so a long-lived FPU context can cause RCU stalls.

The purpose of this patch is to pass at most 4096 bytes to encrypt()
and decrypt() within a single FPU context.

Fixes: 827ee47228a6 ("crypto: x86 - add some helper macros for ECB and CBC modes")
Suggested-by: Elliott, Robert (Servers) <elliott@hpe.com>
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 arch/x86/crypto/ecb_cbc_helpers.h | 34 ++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)
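
For orientation, a hedged sketch of how a glue driver typically consumes these helper macros; the cipher name, the block-function names, and the 16-way parallel count are placeholders, not taken from this patch:

	/* Illustrative only: an ECB encrypt path built on the helper macros.
	 * some_cipher_ecb_enc_16way() and some_cipher_enc_blk() stand in for a
	 * 16-way assembly routine and a one-block fallback, respectively.
	 */
	static int ecb_encrypt(struct skcipher_request *req)
	{
		ECB_WALK_START(req, SOME_CIPHER_BLOCK_SIZE, 16);
		ECB_BLOCK(16, some_cipher_ecb_enc_16way);
		ECB_BLOCK(1, some_cipher_enc_blk);
		ECB_WALK_END();
	}

The third argument to ECB_WALK_START() (16 here) is the fpu_blocks threshold: the macros only wrap a walk step in kernel_fpu_begin()/kernel_fpu_end() when at least that many blocks are available, and that FPU section is what this patch tries to bound to 4096 bytes.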

Comments

Ard Biesheuvel Oct. 7, 2022, 9:54 p.m. UTC | #1
On Tue, 4 Oct 2022 at 17:17, Elliott, Robert (Servers) <elliott@hpe.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Taehee Yoo <ap420073@gmail.com>
> > Sent: Tuesday, October 4, 2022 1:02 AM
> > To: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: linux-crypto@vger.kernel.org; davem@davemloft.net; tglx@linutronix.de;
> > mingo@redhat.com; bp@alien8.de; dave.hansen@linux.intel.com; x86@kernel.org;
> > hpa@zytor.com; ardb@kernel.org; ebiggers@google.com
> > Subject: Re: [PATCH] crypto: x86: Do not acquire fpu context for too long
> >
> > Hi Herbert,
> > Thanks a lot for your review!
> >
> > On 10/4/22 13:52, Herbert Xu wrote:
> >  > On Tue, Oct 04, 2022 at 04:49:12AM +0000, Taehee Yoo wrote:
> >  >>
> >  >>   #define ECB_WALK_START(req, bsize, fpu_blocks) do {                     \
> >  >>           void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));   \
> >  >> + unsigned int walked_bytes = 0;                                  \
> >  >>           const int __bsize = (bsize);                                    \
> >  >>           struct skcipher_walk walk;                                      \
> >  >> - int err = skcipher_walk_virt(&walk, (req), false);              \
> >  >> + int err;                                                        \
> >  >> +                                                                 \
> >  >> + err = skcipher_walk_virt(&walk, (req), false);                  \
> >  >>           while (walk.nbytes > 0) {                                       \
> >  >> -         unsigned int nbytes = walk.nbytes;                      \
> >  >> -         bool do_fpu = (fpu_blocks) != -1 &&                     \
> >  >> -                       nbytes >= (fpu_blocks) * __bsize;         \
> >  >>                   const u8 *src = walk.src.virt.addr;                     \
> >  >> -         u8 *dst = walk.dst.virt.addr;                           \
> >  >>                   u8 __maybe_unused buf[(bsize)];                         \
> >  >> -         if (do_fpu) kernel_fpu_begin()
> >  >> +         u8 *dst = walk.dst.virt.addr;                           \
> >  >> +         unsigned int nbytes;                                    \
> >  >> +         bool do_fpu;                                            \
> >  >> +                                                                 \
> >  >> +         if (walk.nbytes - walked_bytes > ECB_CBC_WALK_MAX) {    \
> >  >> +                 nbytes = ECB_CBC_WALK_MAX;                      \
> >  >> +                 walked_bytes += ECB_CBC_WALK_MAX;               \
> >  >> +         } else {                                                \
> >  >> +                 nbytes = walk.nbytes - walked_bytes;            \
> >  >> +                 walked_bytes = walk.nbytes;                     \
> >  >> +         }                                                       \
> >  >> +                                                                 \
> >  >> +         do_fpu = (fpu_blocks) != -1 &&                          \
> >  >> +                  nbytes >= (fpu_blocks) * __bsize;              \
> >  >> +         if (do_fpu)                                             \
> >  >> +                 kernel_fpu_begin()
> >  >>
> >  >>   #define CBC_WALK_START(req, bsize, fpu_blocks)                          \
> >  >>           ECB_WALK_START(req, bsize, fpu_blocks)
> >  >> @@ -65,8 +81,12 @@
> >  >>   } while (0)
> >  >>
> >  >>   #define ECB_WALK_END()                                                  \
> >  >> -         if (do_fpu) kernel_fpu_end();                           \
> >  >> +         if (do_fpu)                                             \
> >  >> +                 kernel_fpu_end();                               \
> >  >> +         if (walked_bytes < walk.nbytes)                         \
> >  >> +                 continue;                                       \
> >  >>                   err = skcipher_walk_done(&walk, nbytes);                \
> >  >> +         walked_bytes = 0;                                       \
> >  >>           }                                                               \
> >  >>           return err;                                                     \
> >  >>   } while (0)
> >  >
> >  > skcipher_walk_* is supposed to return at most a page.  Why is this
> >  > necessary?
> >  >
> >  > Cheers,
> >
> > I referred to the link below:
> > https://lore.kernel.org/all/MW5PR84MB18426EBBA3303770A8BC0BDFAB759@MW5PR84MB1842.NAMPRD84.PROD.OUTLOOK.COM/
> >
> > Sorry, I didn't check that skcipher_walk_* returns at most 4K at a time,
> > so I assumed the FPU context could be held for too long.
> > I have now checked skcipher_walk_*, and you are right: it returns
> > no more than 4K per step, so there is no problem, as you mentioned.
> >
> > Thank you so much!
> > Taehee Yoo
>
> I think functions using the ECB and CBC macros (and
> those helper functions) are safe  - notice the third
> argument is called fpu_blocks. So, Aria's ECB mode is
> probably safe. There are no CTR macros, so that needs
> to be checked more carefully.
>
> We need to check all the functions that don't use the
> macros and functions. The SHA functions (I've started
> working on a patch for those).
>
> Running modprobe tcrypt mode=0, I encountered RCU stalls
> on these:
>         tcrypt: testing encryption speed of sync skcipher cts(cbc(aes)) using cts(cbc(aes-aesni))
>         tcrypt: testing encryption speed of sync skcipher cfb(aes) using cfb(aes-aesni)
>
> aesni-intel_glue.c registers "__cts(cbc(aes))", not "cts(cbc(aes-aesni))",
> and doesn't register any cfb algorithms, so those tests are using the
> generic templates, which must not be breaking up the loops as needed.
>

These all use the aes-aesni cipher wrapped in various layers of
generic code. The core cipher puts kernel_fpu_begin/end around every
AES block (16 bytes) it processes, so I doubt that the crypto code is
at fault here, unless the issue is in tcrypt itself.
Elliott, Robert (Servers) Oct. 8, 2022, 7:48 p.m. UTC | #2
> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Friday, October 7, 2022 4:55 PM
> To: Elliott, Robert (Servers) <elliott@hpe.com>
> Cc: Taehee Yoo <ap420073@gmail.com>; Herbert Xu <herbert@gondor.apana.org.au>;
> linux-crypto@vger.kernel.org; davem@davemloft.net; tglx@linutronix.de;
> mingo@redhat.com; bp@alien8.de; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; ebiggers@google.com
> Subject: Re: [PATCH] crypto: x86: Do not acquire fpu context for too long
> 
> On Tue, 4 Oct 2022 at 17:17, Elliott, Robert (Servers) <elliott@hpe.com>
> wrote:
> >
> > [...]
> >
> > I think functions using the ECB and CBC macros (and
> > those helper functions) are safe  - notice the third
> > argument is called fpu_blocks. So, Aria's ECB mode is
> > probably safe. There are no CTR macros, so that needs
> > to be checked more carefully.
> >
> > We need to check all the functions that don't use the
> > macros and functions. The SHA functions (I've started
> > working on a patch for those).
> >
> > Running modprobe tcrypt mode=0, I encountered RCU stalls
> > on these:
> >         tcrypt: testing encryption speed of sync skcipher cts(cbc(aes)) using cts(cbc(aes-aesni))
> >         tcrypt: testing encryption speed of sync skcipher cfb(aes) using cfb(aes-aesni)
> >
> > aesni-intel_glue.c registers "__cts(cbc(aes))", not "cts(cbc(aes-aesni))",
> > and doesn't register any cfb algorithms, so those tests are using the
> > generic templates, which must not be breaking up the loops as needed.
> >
> 
> These all use the aes-aesni cipher wrapped in various layers of
> generic code. The core cipher puts kernel_fpu_begin/end around every
> AES block (16 bytes) it processes, so I doubt that the crypto code is
> at fault here, unless the issue is in tcrypt itself.

In 2018, commit 2af632996b89 ("crypto: tcrypt - reschedule during
speed tests") added cond_resched() calls to "Avoid RCU stalls in
the case of non-preemptible kernel and lengthy speed tests by
rescheduling when advancing from one block size to another."

It only makes those calls if the sec module parameter is used
(run the speed test for a certain number of seconds), not the
default "cycles" mode.

    if (secs) {
        ret = test_mb_acipher_jiffies(data, enc, bs, secs, num_mb);
        cond_resched();
    } else {
        ret = test_mb_acipher_cycles(data, enc, bs, num_mb);
    }

In the original sighting, all three occurred during mode 200.

The cycle counts range from 151 to 221840 clock cycles. The
stalls do not correlate to the longer cycle counts.

The first 8 tests were fine:

Aug 11 18:55:06 adevxp033-sys unknown: Running modprobe tcrypt mode=200
tcrypt: testing encryption speed of sync skcipher ecb(aes) using ecb(aes-aesni)
tcrypt: testing decryption speed of sync skcipher ecb(aes) using ecb(aes-aesni)
tcrypt: testing encryption speed of sync skcipher cbc(aes) using cbc(aes-aesni)
tcrypt: testing decryption speed of sync skcipher cbc(aes) using cbc(aes-aesni)
tcrypt: testing encryption speed of sync skcipher lrw(aes) using lrw(ecb(aes-aesni))
tcrypt: testing decryption speed of sync skcipher lrw(aes) using lrw(ecb(aes-aesni))
tcrypt: testing encryption speed of sync skcipher xts(aes) using xts(ecb(aes-aesni))
tcrypt: testing decryption speed of sync skcipher xts(aes) using xts(ecb(aes-aesni))

but the 9th triggered the first stall warning:
tcrypt: testing encryption speed of sync skcipher cts(cbc(aes)) using cts(cbc(aes-aesni))
tcrypt: test 0 (128 bit key, 16 byte blocks): 1 operation in 243 cycles (16 bytes)
tcrypt: test 1 (128 bit key, 64 byte blocks): 1 operation in 728 cycles (64 bytes)
tcrypt: test 2 (128 bit key, 128 byte blocks): 1 operation in 939 cycles (128 bytes)
tcrypt: test 3 (128 bit key, 256 byte blocks): 1 operation in 1341 cycles (256 bytes)
tcrypt: test 4 (128 bit key, 1024 byte blocks): 1 operation in 3786 cycles (1024 bytes)
tcrypt: test 5 (128 bit key, 1424 byte blocks): 1 operation in 5064 cycles (1424 bytes)
tcrypt: test 6 (128 bit key, 4096 byte blocks): 1 operation in 13889 cycles (4096 bytes)
tcrypt: test 7 (192 bit key, 16 byte blocks): 1 operation in 236 cycles (16 bytes)
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
1 operation in 745 cycles (64 bytes)

The stack trace prints interspersed with the next 26 test prints,
as it moved into:
tcrypt: testing decryption speed of sync skcipher cts(cbc(aes)) using cts(cbc(aes-aesni))

It proceeded through the next two with no problem:
tcrypt: testing encryption speed of sync skcipher ctr(aes) using ctr(aes-aesni)
tcrypt: testing decryption speed of sync skcipher ctr(aes) using ctr(aes-aesni)

and then reported two stalls during cfb encryption and cfb decryption:
tcrypt: testing encryption speed of sync skcipher cfb(aes) using cfb(aes-aesni)
tcrypt: test 0 (128 bit key, 16 byte blocks): 1 operation in 182 cycles (16 bytes)
tcrypt: test 1 (128 bit key, 64 byte blocks): 1 operation in 347 cycles (64 bytes)
tcrypt: test 2 (128 bit key, 128 byte blocks): 1 operation in 542 cycles (128 bytes)
tcrypt: test 3 (128 bit key, 256 byte blocks): 1 operation in 963 cycles (256 bytes)
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
1 operation in 3465 cycles (1024 bytes)
tcrypt: test 5 (128 bit key, 1420 byte blocks): 1 operation in 4808 cycles (1420 bytes)
tcrypt: test 6 (128 bit key, 4096 byte blocks): 1 operation in 13631 cycles (4096 bytes)
tcrypt: test 7 (192 bit key, 16 byte blocks): 1 operation in 190 cycles (16 bytes)
tcrypt: test 8 (192 bit key, 64 byte blocks): 1 operation in 360 cycles (64 bytes)
tcrypt: test 9 (192 bit key, 128 byte blocks): 1 operation in 570 cycles (128 bytes)
tcrypt: test 10 (192 bit key, 256 byte blocks): 1 operation in 1010 cycles (256 bytes)
tcrypt: test 11 (192 bit key, 1024 byte blocks): 1 operation in 3660 cycles (1024 bytes)
tcrypt: test 12 (192 bit key, 1420 byte blocks): 1 operation in 5088 cycles (1420 bytes)
tcrypt: test 13 (192 bit key, 4096 byte blocks): 1 operation in 14781 cycles (4096 bytes)
tcrypt: test 14 (256 bit key, 16 byte blocks): 1 operation in 195 cycles (16 bytes)
tcrypt: test 15 (256 bit key, 64 byte blocks): 1 operation in 363 cycles (64 bytes)
tcrypt: test 16 (256 bit key, 128 byte blocks): 1 operation in 592 cycles (128 bytes)
1 operation in 1046 cycles (256 bytes)
tcrypt: test 18 (256 bit key, 1024 byte blocks): 1 operation in 3796 cycles (1024 bytes)
tcrypt: test 19 (256 bit key, 1420 byte blocks): 1 operation in 5283 cycles (1420 bytes)
tcrypt: test 20 (256 bit key, 4096 byte blocks): 1 operation in 14925 cycles (4096 bytes)
tcrypt: testing decryption speed of sync skcipher cfb(aes) using cfb(aes-aesni)
1 operation in 184 cycles (16 bytes)
1 operation in 415 cycles (64 bytes)
tcrypt: test 2 (128 bit key, 128 byte blocks): 1 operation in 661 cycles (128 bytes)
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
1 operation in 1215 cycles (256 bytes)

None of the other modes (from 0 to 999) had problems.

mode=200 ran from 18:55:06 to 18:55:09, one of the longer durations,
but not unique (e.g., mode 300 took 4 sec, mode 400 took 5 sec).

Perhaps the cycles mode needs to call cond_resched() too?
Herbert Xu Oct. 9, 2022, 6:19 a.m. UTC | #3
On Sat, Oct 08, 2022 at 07:48:07PM +0000, Elliott, Robert (Servers) wrote:
>
> Perhaps the cycles mode needs to call cond_resched() too?

Yes, just make the cond_resched unconditional.  Having a few too many
rescheds shouldn't be an issue.

Thanks,
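
For the snippet quoted earlier, a minimal sketch of the unconditional variant (illustrative only; it assumes the same surrounding loop as that snippet and is not necessarily the exact patch that was applied):

    if (secs) {
        ret = test_mb_acipher_jiffies(data, enc, bs, secs, num_mb);
    } else {
        ret = test_mb_acipher_cycles(data, enc, bs, num_mb);
    }
    cond_resched();    /* reschedule after both the jiffies and cycles paths */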
Elliott, Robert (Servers) Oct. 9, 2022, 7:58 p.m. UTC | #4
> -----Original Message-----
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Sent: Sunday, October 9, 2022 1:20 AM
> To: Elliott, Robert (Servers) <elliott@hpe.com>
> Cc: Ard Biesheuvel <ardb@kernel.org>; Taehee Yoo <ap420073@gmail.com>; linux-
> crypto@vger.kernel.org; davem@davemloft.net; tglx@linutronix.de;
> mingo@redhat.com; bp@alien8.de; dave.hansen@linux.intel.com; x86@kernel.org;
> hpa@zytor.com; ebiggers@google.com
> Subject: Re: [PATCH] crypto: x86: Do not acquire fpu context for too long
> 
> On Sat, Oct 08, 2022 at 07:48:07PM +0000, Elliott, Robert (Servers) wrote:
> >
> > Perhaps the cycles mode needs to call cond_resched() too?
> 
> Yes, just make the cond_resched unconditional.  Having a few too many
> rescheds shouldn't be an issue.

This looks promising. I was able to trigger a lot of rcu stalls by setting:
  echo 2 > /sys/module/rcupdate/parameters/rcu_cpu_stall_timeout
  echo 200 > /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout

and running these concurrently:
  watch -n 0 modprobe tcrypt mode=200
  watch -n 0 modprobe tcrypt mode=0 through 999

After changing tcrypt to call cond_resched in both cases, I don't see any
more rcu stalls.

I am getting miscompares from the extended self-test for crc32 and
crct10dif, and will investigate those further.

BTW, the way tcrypt always refuses to load leads to an ever-growing list in
the Call Traces:

kernel: Unloaded tainted modules: tcrypt():1 tcrypt():1 tcrypt():1 tcrypt():1 tcrypt():1 [...] tcrypt():1
Elliott, Robert (Servers) Oct. 11, 2022, 3:16 p.m. UTC | #5
> > From: Herbert Xu <herbert@gondor.apana.org.au>
...
> > On Sat, Oct 08, 2022 at 07:48:07PM +0000, Elliott, Robert (Servers) wrote:
> > >
> > > Perhaps the cycles mode needs to call cond_resched() too?
> >
> > Yes, just make the cond_resched unconditional.  Having a few too many
> > rescheds shouldn't be an issue.
> 
> This looks promising. I was able to trigger a lot of rcu stalls by setting:
>   echo 2 > /sys/module/rcupdate/parameters/rcu_cpu_stall_timeout
>   echo 200 > /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout
> 
> and running these concurrently:
>   watch -n 0 modprobe tcrypt mode=200
>   watch -n 0 modprobe tcrypt mode=0 through 999
> 
> I am getting miscompares from the extended self-test for crc32 and
> crct10dif, and will investigate those further.

The assembly functions for those two do not handle small sizes (unlike
crc32c, which handles all sizes), so the new do/while loop needs to
use larger steps and call the generic function for any leftover bytes.
I've corrected that patch.
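
A rough sketch of the looping pattern described above; the chunk size, the 16-byte minimum for the assembly, and the helper names crc_t10dif_pcl()/crc_t10dif_generic() should be read as assumptions of this sketch rather than the exact code in that patch:

	/* Illustrative only: feed the accelerated routine large chunks inside
	 * short FPU sections, and hand whatever tail is too small for the
	 * assembly to the generic C implementation.  FPU_CHUNK is a placeholder
	 * bound on bytes processed per kernel_fpu_begin()/end() section.
	 */
	while (len >= 16) {
		size_t chunk = min_t(size_t, len, FPU_CHUNK);

		kernel_fpu_begin();
		crc = crc_t10dif_pcl(crc, data, chunk);
		kernel_fpu_end();
		data += chunk;
		len  -= chunk;
	}
	if (len)
		crc = crc_t10dif_generic(crc, data, len);	/* leftover bytes */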

> After changing tcrypt to call cond_resched in both cases, I don't see any
> more rcu stalls.

This ran cleanly with a few nights of testing with 
   CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y

which compares the accelerated implementation to the generic
implementation under randomly generated test vectors - a very
helpful test suite. It only works with data sizes up to
2 * PAGE_SIZE, though.

With the cond_resched changes, rcu stalls are gone with one exception.
If printing of Call Traces is triggered for any reason, either by a
test failing (e.g., because of the CRC loop bugs) or by using a SysRq
trigger like:
   echo l > /proc/sysrq-trigger

then the very act of generating and printing the Call Traces can
trigger rcu stall messages. Examples:

  Oct 09 14:56:39 kernel: cryptomgr: alg: shash: crct10dif-pclmul test failed (wrong result) on test vector "random: psize=4303 ksize=0", cfg="random: use_final src_divs=[<flush>95.43%@+4, <flush>4.57%@+6]"
  Oct 09 14:56:39 kernel: cryptomgr: alg: self-tests for crct10dif using crct10dif failed (rc=-22)
  Oct 09 14:56:39 kernel: ------------[ cut here ]------------
  Oct 09 14:56:39 kernel: alg: self-tests for crct10dif using crct10dif failed (rc=-22)
  Oct 09 14:56:39 kernel: WARNING: CPU: 55 PID: 10553 at crypto/testmgr.c:5837 alg_test.cold+0x3e/0x124
  ...
  Oct 09 14:56:42 kernel: R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000040000
  Oct 09 14:56:42 kernel: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:

and:
  Oct 10 19:45:02 kernel: sysrq: Show backtrace of all active CPUs
  ...
  Oct 10 19:45:03 kernel:  </TASK>
  ...
  Oct 10 19:45:06 kernel: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:

If there are no WARNs or call traces, then there are no rcu stalls.

So, we should be aware that commit 09a5ef9644bc0e1 ("crypto: testmgr - WARN
on test failure") might itself lead to additional rcu stall reports.

> BTW, the way tcrypt always refuses to load leads to an ever-growing list in
> the Call Traces:
> 
> kernel: Unloaded tainted modules: tcrypt():1 tcrypt():1 tcrypt():1 tcrypt():1 tcrypt():1 [...] tcrypt():1

That was already noticed and a fix has already been posted for 6.1
as commit 47cc75aa9283 ("module: tracking: Keep a record of tainted unloaded
modules only")
Taehee Yoo Oct. 12, 2022, 6:08 a.m. UTC | #6
Hi Elliott, Robert

On Oct. 10, 2022, 4:58 AM, Elliott, Robert (Servers) wrote:
 >
 > [...]

I tested mb_aead as well.

I can trigger rcu stalls easily while testing gcm(aes-generic) with the
commands below.
#shell1
while :
do
     modprobe tcrypt mode=215 num_mb=1024
done
#shell2
while :
do
     modprobe tcrypt mode=0
done

Then, I added cond_resched() as you mentioned.

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index a82679b576bb..eeb3abb4eece 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -415,6 +415,7 @@ static void test_mb_aead_speed(const char *algo, int enc, int secs,
                         } else {
                                 ret = test_mb_aead_cycles(data, enc, bs,
                                                           num_mb);
+                               cond_resched();
                         }

I can't see rcu stalls anymore; I think it works well.

Patch

diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
index eaa15c7b29d6..551d8bdfd037 100644
--- a/arch/x86/crypto/ecb_cbc_helpers.h
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -11,19 +11,35 @@ 
  * having to rely on indirect calls and retpolines.
  */
 
+#define ECB_CBC_WALK_MAX	4096
+
 #define ECB_WALK_START(req, bsize, fpu_blocks) do {			\
 	void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));	\
+	unsigned int walked_bytes = 0;					\
 	const int __bsize = (bsize);					\
 	struct skcipher_walk walk;					\
-	int err = skcipher_walk_virt(&walk, (req), false);		\
+	int err;							\
+									\
+	err = skcipher_walk_virt(&walk, (req), false);			\
 	while (walk.nbytes > 0) {					\
-		unsigned int nbytes = walk.nbytes;			\
-		bool do_fpu = (fpu_blocks) != -1 &&			\
-			      nbytes >= (fpu_blocks) * __bsize;		\
 		const u8 *src = walk.src.virt.addr;			\
-		u8 *dst = walk.dst.virt.addr;				\
 		u8 __maybe_unused buf[(bsize)];				\
-		if (do_fpu) kernel_fpu_begin()
+		u8 *dst = walk.dst.virt.addr;				\
+		unsigned int nbytes;					\
+		bool do_fpu;						\
+									\
+		if (walk.nbytes - walked_bytes > ECB_CBC_WALK_MAX) {	\
+			nbytes = ECB_CBC_WALK_MAX;			\
+			walked_bytes += ECB_CBC_WALK_MAX;		\
+		} else {						\
+			nbytes = walk.nbytes - walked_bytes;		\
+			walked_bytes = walk.nbytes;			\
+		}							\
+									\
+		do_fpu = (fpu_blocks) != -1 &&				\
+			 nbytes >= (fpu_blocks) * __bsize;		\
+		if (do_fpu)						\
+			kernel_fpu_begin()
 
 #define CBC_WALK_START(req, bsize, fpu_blocks)				\
 	ECB_WALK_START(req, bsize, fpu_blocks)
@@ -65,8 +81,12 @@ 
 } while (0)
 
 #define ECB_WALK_END()							\
-		if (do_fpu) kernel_fpu_end();				\
+		if (do_fpu)						\
+			kernel_fpu_end();				\
+		if (walked_bytes < walk.nbytes)				\
+			continue;					\
 		err = skcipher_walk_done(&walk, nbytes);		\
+		walked_bytes = 0;					\
 	}								\
 	return err;							\
 } while (0)