diff mbox series

[v3,19/29] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation

Message ID 20191007164610.6881-20-ard.biesheuvel@linaro.org
State New
Headers show
Series crypto: crypto API library interfaces for WireGuard | expand

Commit Message

Ard Biesheuvel Oct. 7, 2019, 4:46 p.m. UTC
This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
for MIPS authored by Andy Polyakov, and contributed by him to the OpenSSL
project. The file 'poly1305-mips.pl' is taken straight from this upstream
GitHub repository [0] at commit 57c3a63be70b4f68b9eec1b043164ea790db6499,
and already contains all the changes required to build it as part of a
Linux kernel module.

[0] https://github.com/dot-asm/cryptogams

Co-developed-by: Andy Polyakov <appro@cryptogams.org>
Signed-off-by: Andy Polyakov <appro@cryptogams.org>

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 arch/mips/crypto/Makefile         |   14 +
 arch/mips/crypto/poly1305-glue.c  |  203 ++++
 arch/mips/crypto/poly1305-mips.pl | 1246 ++++++++++++++++++++
 crypto/Kconfig                    |    6 +
 4 files changed, 1469 insertions(+)

-- 
2.20.1

Comments

René van Dorst Oct. 7, 2019, 9:02 p.m. UTC | #1
Quoting Ard Biesheuvel <ard.biesheuvel@linaro.org>:

> This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation

> for MIPS authored by Andy Polyakov, and contributed by him to the OpenSSL

> project. The file 'poly1305-mips.pl' is taken straight from this upstream

> GitHub repository [0] at commit 57c3a63be70b4f68b9eec1b043164ea790db6499,

> and already contains all the changes required to build it as part of a

> Linux kernel module.

>

> [0] https://github.com/dot-asm/cryptogams

>

> Co-developed-by: Andy Polyakov <appro@cryptogams.org>

> Signed-off-by: Andy Polyakov <appro@cryptogams.org>

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---

>  arch/mips/crypto/Makefile         |   14 +

>  arch/mips/crypto/poly1305-glue.c  |  203 ++++

>  arch/mips/crypto/poly1305-mips.pl | 1246 ++++++++++++++++++++

>  crypto/Kconfig                    |    6 +

>  4 files changed, 1469 insertions(+)

>

> <snip>


Hi Ard,

Is it also an option to include my mips32r2 optimized poly1305 version?

Below are the results, which show a good improvement over the Andy
Polyakov version.
I swapped the poly1305 assembly file and renamed the functions to
<func_name>_mips.
The full WireGuard source with the changes is at [0].

bytes |  RvD | openssl | delta | delta / openssl
     0 |  155 |   168   |   -13 |  -7,74%
     1 |  260 |   283   |   -23 |  -8,13%
    16 |  215 |   236   |   -21 |  -8,90%
    64 |  321 |   373   |   -52 | -13,94%
   576 | 1440 |  1813   |  -373 | -20,57%
  1280 | 2987 |  3801   |  -814 | -21,42%
  1408 | 3268 |  4161   |  -893 | -21,46%
  1420 | 3362 |  4267   |  -905 | -21,21%
  1440 | 3337 |  4250   |  -913 | -21,48%
  1536 | 3545 |  4531   |  -986 | -21,76%
  4096 | 9160 | 11755   | -2595 | -22,08%


Wireguard speedbench with my poly1305 implementation
[  412.010349] wireguard: chacha20 self-tests: pass
[  412.038265] wireguard: poly1305 self-tests: pass
[  412.050422] wireguard: chacha20poly1305 self-tests: pass
[  412.268724] wireguard: chacha20poly1305_encrypt:    1 bytes,        
0.252 MB/sec,     1603 cycles
[  412.488506] wireguard: chacha20poly1305_encrypt:   16 bytes,        
4.159 MB/sec,     1558 cycles
[  412.709162] wireguard: chacha20poly1305_encrypt:   64 bytes,       
15.356 MB/sec,     1696 cycles
[  412.932366] wireguard: chacha20poly1305_encrypt:  128 bytes,       
22.033 MB/sec,     2385 cycles
[  413.229175] wireguard: chacha20poly1305_encrypt: 1420 bytes,       
35.480 MB/sec,    16740 cycles
[  413.519035] wireguard: chacha20poly1305_encrypt: 1440 bytes,       
36.117 MB/sec,    16706 cycles
[  413.737346] wireguard: chacha20poly1305_decrypt:    1 bytes,        
0.246 MB/sec,     1654 cycles
[  413.957112] wireguard: chacha20poly1305_decrypt:   16 bytes,        
4.045 MB/sec,     1605 cycles
[  414.177758] wireguard: chacha20poly1305_decrypt:   64 bytes,       
14.953 MB/sec,     1744 cycles
[  414.400964] wireguard: chacha20poly1305_decrypt:  128 bytes,       
21.642 MB/sec,     2434 cycles
[  414.687803] wireguard: chacha20poly1305_decrypt: 1420 bytes,       
35.480 MB/sec,    16787 cycles
[  414.977636] wireguard: chacha20poly1305_decrypt: 1440 bytes,       
35.979 MB/sec,    16754 cycles
[  415.190375] wireguard: poly1305:    0 bytes,       0.000 MB/sec,     
   155 cycles
[  415.400864] wireguard: poly1305:    1 bytes,       1.375 MB/sec,     
   260 cycles
[  415.610655] wireguard: poly1305:   16 bytes,      25.817 MB/sec,     
   215 cycles
[  415.821149] wireguard: poly1305:   64 bytes,      72.936 MB/sec,     
   321 cycles
[  416.036357] wireguard: poly1305:  576 bytes,     162.047 MB/sec,     
  1440 cycles
[  416.263561] wireguard: poly1305: 1280 bytes,     177.124 MB/sec,     
  2987 cycles
[  416.484869] wireguard: poly1305: 1408 bytes,     178.320 MB/sec,     
  3268 cycles
[  416.715311] wireguard: poly1305: 1420 bytes,     174.693 MB/sec,     
  3362 cycles
[  416.945195] wireguard: poly1305: 1440 bytes,     178.527 MB/sec,     
  3337 cycles
[  417.176158] wireguard: poly1305: 1536 bytes,     179.296 MB/sec,     
  3545 cycles
[  417.432304] wireguard: poly1305: 4096 bytes,     186.718 MB/sec,     
  9160 cycles

Wireguard speedbench with the openssl poly1305 implementation
[  707.579242] wireguard: chacha20 self-tests: pass
[  707.610460] wireguard: poly1305 self-tests: pass
[  707.622678] wireguard: chacha20poly1305 self-tests: pass
[  707.838929] wireguard: chacha20poly1305_encrypt:    1 bytes,        
0.247 MB/sec,     1638 cycles
[  708.058698] wireguard: chacha20poly1305_encrypt:   16 bytes,        
4.072 MB/sec,     1590 cycles
[  708.279486] wireguard: chacha20poly1305_encrypt:   64 bytes,       
14.776 MB/sec,     1758 cycles
[  708.502893] wireguard: chacha20poly1305_encrypt:  128 bytes,       
21.105 MB/sec,     2490 cycles
[  708.803678] wireguard: chacha20poly1305_encrypt: 1420 bytes,       
33.583 MB/sec,    17695 cycles
[  709.103566] wireguard: chacha20poly1305_encrypt: 1440 bytes,       
34.194 MB/sec,    17669 cycles
[  709.327515] wireguard: chacha20poly1305_decrypt:    1 bytes,        
0.240 MB/sec,     1684 cycles
[  709.547304] wireguard: chacha20poly1305_decrypt:   16 bytes,        
3.963 MB/sec,     1638 cycles
[  709.768088] wireguard: chacha20poly1305_decrypt:   64 bytes,       
14.404 MB/sec,     1805 cycles
[  709.991500] wireguard: chacha20poly1305_decrypt:  128 bytes,       
20.739 MB/sec,     2534 cycles
[  710.282292] wireguard: chacha20poly1305_decrypt: 1420 bytes,       
33.583 MB/sec,    17740 cycles
[  710.582175] wireguard: chacha20poly1305_decrypt: 1440 bytes,       
34.057 MB/sec,    17718 cycles
[  710.800476] wireguard: poly1305:    0 bytes,       0.000 MB/sec,     
   168 cycles
[  711.011010] wireguard: poly1305:    1 bytes,       1.277 MB/sec,     
   283 cycles
[  711.220790] wireguard: poly1305:   16 bytes,      23.590 MB/sec,     
   236 cycles
[  711.431430] wireguard: poly1305:   64 bytes,      63.702 MB/sec,     
   373 cycles
[  711.648132] wireguard: poly1305:  576 bytes,     129.473 MB/sec,     
  1813 cycles
[  711.877393] wireguard: poly1305: 1280 bytes,     139.404 MB/sec,     
  3801 cycles
[  712.109065] wireguard: poly1305: 1408 bytes,     140.185 MB/sec,     
  4161 cycles
[  712.339563] wireguard: poly1305: 1420 bytes,     137.994 MB/sec,     
  4267 cycles
[  712.569491] wireguard: poly1305: 1440 bytes,     140.349 MB/sec,     
  4250 cycles
[  712.800790] wireguard: poly1305: 1536 bytes,     140.917 MB/sec,     
  4531 cycles
[  713.064421] wireguard: poly1305: 4096 bytes,     145.703 MB/sec,     
11755 cycles

Greats,

René

[0]: https://github.com/vDorst/wireguard/commits/mips-bench
Ard Biesheuvel Oct. 8, 2019, 5:55 a.m. UTC | #2
On Mon, 7 Oct 2019 at 23:02, René van Dorst <opensource@vdorst.com> wrote:
>

> Quoting Ard Biesheuvel <ard.biesheuvel@linaro.org>:

>

> > This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation

> > for MIPS authored by Andy Polyakov, and contributed by him to the OpenSSL

> > project. The file 'poly1305-mips.pl' is taken straight from this upstream

> > GitHub repository [0] at commit 57c3a63be70b4f68b9eec1b043164ea790db6499,

> > and already contains all the changes required to build it as part of a

> > Linux kernel module.

> >

> > [0] https://github.com/dot-asm/cryptogams

> >

> > Co-developed-by: Andy Polyakov <appro@cryptogams.org>

> > Signed-off-by: Andy Polyakov <appro@cryptogams.org>

> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> > ---

> >  arch/mips/crypto/Makefile         |   14 +

> >  arch/mips/crypto/poly1305-glue.c  |  203 ++++

> >  arch/mips/crypto/poly1305-mips.pl | 1246 ++++++++++++++++++++

> >  crypto/Kconfig                    |    6 +

> >  4 files changed, 1469 insertions(+)

> >

> > <snip>

>

> Hi Ard,

>

> Is it also an option to include my mip32r2 optimized poly1305 version?

>

> Below the results which shows a good improvement over the Andy

> Polyakov version.

> I swapped the poly1305 assembly file and rename the function to

> <func_name>_mips

> Full WireGuard source with the changes [0]

>

> bytes |  RvD | openssl | delta | delta / openssl

>      0 |  155 |   168   |   -13 |  -7,74%

>      1 |  260 |   283   |   -23 |  -8,13%

>     16 |  215 |   236   |   -21 |  -8,90%

>     64 |  321 |   373   |   -52 | -13,94%

>    576 | 1440 |  1813   |  -373 | -20,57%

>   1280 | 2987 |  3801   |  -814 | -21,42%

>   1408 | 3268 |  4161   |  -893 | -21,46%

>   1420 | 3362 |  4267   |  -905 | -21,21%

>   1440 | 3337 |  4250   |  -913 | -21,48%

>   1536 | 3545 |  4531   |  -986 | -21,76%

>   4096 | 9160 | 11755   | -2595 | -22,08%

>


Hi René,

I agree that, given these numbers, we should definitely switch to your
code when building for 32r2 specifically. I'll incorporate that for
the next revision.
Andy Polyakov Oct. 8, 2019, 11:38 a.m. UTC | #3
Hi,

On 10/7/19 11:02 PM, René van Dorst wrote:
> Quoting Ard Biesheuvel <ard.biesheuvel@linaro.org>:

> 

>> This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305

>> implementation

>> for MIPS authored by Andy Polyakov, and contributed by him to the OpenSSL

>> project.


Formally speaking, this is a slightly misleading statement. The Cryptogams
poly1305-mips module implements both 64- and 32-bit code paths, while
what you'll find in OpenSSL is a 64-bit-only implementation. But in either case...

>> <snip>

> 

> Hi Ard,

> 

> Is it also an option to include my mip32r2 optimized poly1305 version?

> 

> Below the results which shows a good improvement over the Andy Polyakov

> version.

> I swapped the poly1305 assembly file and rename the function to

> <func_name>_mips

> Full WireGuard source with the changes [0]

> 

> bytes |  RvD | openssl | delta | delta / openssl

>  ...

>  4096 | 9160 | 11755   | -2595 | -22,08%


I assume that the presented results depict regression after switch to
cryptogams module. Right? RvD implementation distinguishes itself in two
ways:

1. some of additions in inner loop are replaced with multiply-by-1-n-add;
2. carry chain at the end of the inner loop is effectively fused with
beginning of the said loop/taken out of the loop.

I recall attempting 1. and choosing not to do it, with the following rationale.
On processor I have access to, Octeon II, it made no significant
difference. It was better, but only marginally. And it's understandable,
because Octeon II should have lesser difficulty pairing those additions
with multiply-n-add instructions. But since multiplication is an
expensive operation, it can be pretty slow, I reckoned that on processor
less potent than Octeon II it might be more appropriate to minimize
amount of multiplication-n-add instructions. In other words idea is not
(and never has been) to get fixated on specific processor at hand, but
try to find a sensible compromise that would produce reasonable
performance on a range of processors. Of course problem is that it's
just an assumption I made here, and it could turn wrong in practice:-)
So I wonder which processor do you run on, René? For reference, I measure
more than 70MB/sec for 1KB blocks for chacha20poly1305 on 1GHz Octeon II. You
report ~34MB/sec, so it ought to be something different. Given the second
data point, it might be appropriate to reconsider and settle for
multiply-by-1-n-add.

As for 2. I haven't considered it. Since it's a back-to-back dependency
chain, if fused with top of the loop, it actually has more promising
potential than 1. And it would improve all results, not only MIPS32R2.
Would you trust me with adopting it to my module? Naturally with due credit.

Cheers.
René van Dorst Oct. 8, 2019, 5:46 p.m. UTC | #4
Quoting Andy Polyakov <appro@cryptogams.org>:

> Hi,

>

> On 10/7/19 11:02 PM, René van Dorst wrote:

>> Quoting Ard Biesheuvel <ard.biesheuvel@linaro.org>:

>>

>>> This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305

>>> implementation

>>> for MIPS authored by Andy Polyakov, and contributed by him to the OpenSSL

>>> project.

>

> Formally speaking this is a little bit misleading statement. Cryptogams

> poly1305-mips module implements both 64- and 32-bit code paths, while

> what you'll find in OpenSSL is 64-only implementation. But in either case...


Hi Andy,

Sorry for the confusion and that it is misleading.
I took the assembly output generated by
arch/mips/crypto/poly1305-mips.pl, which
is included in Ard's series [0]. The output was generated while compiling a
mips32r2 kernel with Ard's series included.
So it should have generated the mips32r2 variant [1], and I appended "_mips"
to the function names so they match the current WireGuard implementation, so
that it is now a drop-in replacement.

>>> <snip>

>>

>> Hi Ard,

>>

>> Is it also an option to include my mip32r2 optimized poly1305 version?

>>

>> Below the results which shows a good improvement over the Andy Polyakov

>> version.

>> I swapped the poly1305 assembly file and rename the function to

>> <func_name>_mips

>> Full WireGuard source with the changes [0]

>>

>> bytes |  RvD | openssl | delta | delta / openssl

>> ...

>> 4096 | 9160 | 11755   | -2595 | -22,08%

>

> I assume that the presented results depict regression after switch to

> cryptogams module. Right?


Yes, by only swapping poly1305 assembly file.

> RvD implementation distinguishes itself in two ways:

>

> 1. some of additions in inner loop are replaced with multiply-by-1-n-add;

> 2. carry chain at the end of the inner loop is effectively fused with

> beginning of the said loop/taken out of the loop.

>

> I recall attempting 1. and chosen not to do it with following rationale.

> On processor I have access to, Octeon II, it made no significant

> difference. It was better, but only marginally. And it's understandable,

> because Octeon II should have lesser difficulty pairing those additions

> with multiply-n-add instructions. But since multiplication is an

> expensive operation, it can be pretty slow, I reckoned that on processor

> less potent than Octeon II it might be more appropriate to minimize

> amount of multiplication-n-add instructions. In other words idea is not

> (and never has been) to get fixated on specific processor at hand, but

> try to find a sensible compromise that would produce reasonable

> performance on a range of processors. Of course problem is that it's

> just an assumption I made here, and it could turn wrong in practice:-)


I used poly1305-donna32.c [4] as a reference for my version.
Using multiply-n-add is a logical choice for mips32r2 with this code.
I am only using multiply-by-1-n-add after the multiply-n-add, for adding the
carry of the previous calculation. It seems to have no downside.
I manually checked for stalls by adding nop instructions after multiply-n-add.
But the benchmark result shows me an increase in cpu cycles with the nops.

So using multiply-by-1-n-add only for additions is slow.

> So I wonder which processor do you run on, René?


I am using a Mediatek MT7621 mips32r2 running at 880MHz. [3]

> 70MB/sec for 1KB blocks for chacha20poly1305 on 1GHz Octeon II. You

> report ~34MB/sec, so it ought to be something different. Given second

> data point it might be appropriate to reconsider and settle for

> multiply-by-1-n-add.

>


multiply-by-1-n-add is slow as a standalone feature.
I would not recommend it.

> As for 2. I haven't considered it. Since it's a back-to-back dependency

> chain, if fused with top of the loop, it actually has more promising

> potential than 1. And it would improve all results, not only MISP32R2.

> Would you trust me with adopting it to my module? Naturally with due credit.


Yes that is totally fine.
I hope that you found more spots that we can improve.

>

> Cheers.


Bench results with the generic versions of chacha20 and poly1305 that
come with
WireGuard.

[ 1328.931574] wireguard: chacha20poly1305 self-tests: pass
[ 1329.151368] wireguard: chacha20poly1305_encrypt:    1 bytes,        
0.228 MB/sec,     1779 cycles
[ 1329.371232] wireguard: chacha20poly1305_encrypt:   16 bytes,        
3.716 MB/sec,     1752 cycles
[ 1329.592467] wireguard: chacha20poly1305_encrypt:   64 bytes,       
13.005 MB/sec,     2016 cycles
[ 1329.816587] wireguard: chacha20poly1305_encrypt:  128 bytes,       
18.200 MB/sec,     2902 cycles
[ 1330.128756] wireguard: chacha20poly1305_encrypt: 1408 bytes,       
28.735 MB/sec,    20550 cycles
[ 1330.441997] wireguard: chacha20poly1305_encrypt: 1420 bytes,       
28.032 MB/sec,    21247 cycles
[ 1330.752105] wireguard: chacha20poly1305_encrypt: 1440 bytes,       
28.426 MB/sec,    21268 cycles
[ 1330.969983] wireguard: chacha20poly1305_decrypt:    1 bytes,        
0.222 MB/sec,     1827 cycles
[ 1331.189853] wireguard: chacha20poly1305_decrypt:   16 bytes,        
3.620 MB/sec,     1799 cycles
[ 1331.411065] wireguard: chacha20poly1305_decrypt:   64 bytes,       
12.695 MB/sec,     2060 cycles
[ 1331.635191] wireguard: chacha20poly1305_decrypt:  128 bytes,       
17.919 MB/sec,     2947 cycles
[ 1331.947393] wireguard: chacha20poly1305_decrypt: 1408 bytes,       
28.735 MB/sec,    20597 cycles
[ 1332.260602] wireguard: chacha20poly1305_decrypt: 1420 bytes,       
28.032 MB/sec,    21287 cycles
[ 1332.570649] wireguard: chacha20poly1305_decrypt: 1440 bytes,       
28.426 MB/sec,    21307 cycles
[ 1332.782310] wireguard: poly1305:    0 bytes,       0.000 MB/sec,     
   176 cycles
[ 1332.992837] wireguard: poly1305:    1 bytes,       1.240 MB/sec,     
   290 cycles
[ 1333.202706] wireguard: poly1305:   16 bytes,      21.672 MB/sec,     
   262 cycles
[ 1333.413510] wireguard: poly1305:   64 bytes,      55.639 MB/sec,     
   434 cycles
[ 1333.632105] wireguard: poly1305:  576 bytes,     103.875 MB/sec,     
  2280 cycles
[ 1333.863911] wireguard: poly1305: 1280 bytes,     110.473 MB/sec,     
  4816 cycles
[ 1334.096050] wireguard: poly1305: 1408 bytes,     111.046 MB/sec,     
  5275 cycles
[ 1334.326574] wireguard: poly1305: 1420 bytes,     109.691 MB/sec,     
  5387 cycles
[ 1334.556580] wireguard: poly1305: 1440 bytes,     111.098 MB/sec,     
  5390 cycles
[ 1334.788215] wireguard: poly1305: 1536 bytes,     111.474 MB/sec,     
  5740 cycles
[ 1335.071139] wireguard: poly1305: 4096 bytes,     114.843 MB/sec,     
14957 cycles
[ 1335.281688] wireguard: chacha20:    0 bytes,       0.000 MB/sec,     
    43 cycles
[ 1335.494245] wireguard: chacha20:    1 bytes,       0.652 MB/sec,     
   592 cycles
[ 1335.704250] wireguard: chacha20:    2 bytes,       1.306 MB/sec,     
   593 cycles
[ 1335.914301] wireguard: chacha20:    3 bytes,       1.928 MB/sec,     
   603 cycles
[ 1336.124247] wireguard: chacha20:    4 bytes,       2.613 MB/sec,     
   593 cycles
[ 1336.334283] wireguard: chacha20:    8 bytes,       5.178 MB/sec,     
   599 cycles
[ 1336.544339] wireguard: chacha20:   16 bytes,      10.146 MB/sec,     
   612 cycles
[ 1336.754727] wireguard: chacha20:   64 bytes,      36.003 MB/sec,     
   696 cycles
[ 1336.989007] wireguard: chacha20:  576 bytes,      40.593 MB/sec,     
  5908 cycles
[ 1337.262407] wireguard: chacha20: 1280 bytes,      41.015 MB/sec,     
13081 cycles
[ 1337.538436] wireguard: chacha20: 1408 bytes,      40.954 MB/sec,     
14381 cycles
[ 1337.821086] wireguard: chacha20: 1420 bytes,      39.813 MB/sec,     
14947 cycles
[ 1338.101206] wireguard: chacha20: 1440 bytes,      40.237 MB/sec,     
14975 cycles
[ 1338.384518] wireguard: chacha20: 1536 bytes,      41.015 MB/sec,     
15686 cycles
[ 1338.785923] wireguard: chacha20: 4096 bytes,      41.406 MB/sec,     
41757 cycles

Again my version but also with chacha20 results.
[ 1481.872439] wireguard: chacha20 self-tests: pass
[ 1481.900361] wireguard: poly1305 self-tests: pass
[ 1481.912533] wireguard: chacha20poly1305 self-tests: pass
[ 1482.130557] wireguard: chacha20poly1305_encrypt:    1 bytes,        
0.251 MB/sec,     1603 cycles
[ 1482.350349] wireguard: chacha20poly1305_encrypt:   16 bytes,        
4.157 MB/sec,     1558 cycles
[ 1482.570994] wireguard: chacha20poly1305_encrypt:   64 bytes,       
15.319 MB/sec,     1696 cycles
[ 1482.794197] wireguard: chacha20poly1305_encrypt:  128 bytes,       
22.021 MB/sec,     2386 cycles
[ 1483.088083] wireguard: chacha20poly1305_encrypt: 1408 bytes,       
36.657 MB/sec,    16105 cycles
[ 1483.381047] wireguard: chacha20poly1305_encrypt: 1420 bytes,       
35.480 MB/sec,    16746 cycles
[ 1483.670908] wireguard: chacha20poly1305_encrypt: 1440 bytes,       
36.117 MB/sec,    16713 cycles
[ 1483.889186] wireguard: chacha20poly1305_decrypt:    1 bytes,        
0.245 MB/sec,     1653 cycles
[ 1484.108959] wireguard: chacha20poly1305_decrypt:   16 bytes,        
4.044 MB/sec,     1605 cycles
[ 1484.329609] wireguard: chacha20poly1305_decrypt:   64 bytes,       
14.934 MB/sec,     1743 cycles
[ 1484.552815] wireguard: chacha20poly1305_decrypt:  128 bytes,       
21.630 MB/sec,     2433 cycles
[ 1484.836716] wireguard: chacha20poly1305_decrypt: 1408 bytes,       
36.523 MB/sec,    16158 cycles
[ 1485.129692] wireguard: chacha20poly1305_decrypt: 1420 bytes,       
35.480 MB/sec,    16794 cycles
[ 1485.419518] wireguard: chacha20poly1305_decrypt: 1440 bytes,       
35.979 MB/sec,    16760 cycles
[ 1485.632222] wireguard: poly1305:    0 bytes,       0.000 MB/sec,     
   154 cycles
[ 1485.842700] wireguard: poly1305:    1 bytes,       1.360 MB/sec,     
   257 cycles
[ 1486.052492] wireguard: poly1305:   16 bytes,      25.513 MB/sec,     
   212 cycles
[ 1486.263004] wireguard: poly1305:   64 bytes,      72.887 MB/sec,     
   323 cycles
[ 1486.478211] wireguard: poly1305:  576 bytes,     161.993 MB/sec,     
  1440 cycles
[ 1486.705407] wireguard: poly1305: 1280 bytes,     177.001 MB/sec,     
  2986 cycles
[ 1486.926708] wireguard: poly1305: 1408 bytes,     178.185 MB/sec,     
  3266 cycles
[ 1487.157166] wireguard: poly1305: 1420 bytes,     174.693 MB/sec,     
  3363 cycles
[ 1487.387048] wireguard: poly1305: 1440 bytes,     178.527 MB/sec,     
  3338 cycles
[ 1487.618013] wireguard: poly1305: 1536 bytes,     179.150 MB/sec,     
  3546 cycles
[ 1487.874161] wireguard: poly1305: 4096 bytes,     186.718 MB/sec,     
  9162 cycles
[ 1488.081633] wireguard: chacha20:    0 bytes,       0.000 MB/sec,     
    28 cycles
[ 1488.294111] wireguard: chacha20:    1 bytes,       0.693 MB/sec,     
   557 cycles
[ 1488.504097] wireguard: chacha20:    2 bytes,       1.380 MB/sec,     
   557 cycles
[ 1488.714109] wireguard: chacha20:    3 bytes,       2.066 MB/sec,     
   560 cycles
[ 1488.924084] wireguard: chacha20:    4 bytes,       2.776 MB/sec,     
   554 cycles
[ 1489.134096] wireguard: chacha20:    8 bytes,       5.540 MB/sec,     
   557 cycles
[ 1489.344120] wireguard: chacha20:   16 bytes,      10.970 MB/sec,     
   562 cycles
[ 1489.554217] wireguard: chacha20:   64 bytes,      42.424 MB/sec,     
   583 cycles
[ 1489.784540] wireguard: chacha20:  576 bytes,      48.394 MB/sec,     
  4947 cycles
[ 1490.042459] wireguard: chacha20: 1280 bytes,      48.950 MB/sec,     
10947 cycles
[ 1490.307525] wireguard: chacha20: 1408 bytes,      49.010 MB/sec,     
12035 cycles
[ 1490.579962] wireguard: chacha20: 1420 bytes,      47.261 MB/sec,     
12558 cycles
[ 1490.850028] wireguard: chacha20: 1440 bytes,      47.927 MB/sec,     
12570 cycles
[ 1491.122613] wireguard: chacha20: 1536 bytes,      48.925 MB/sec,     
13128 cycles
[ 1491.494187] wireguard: chacha20: 4096 bytes,      49.218 MB/sec,     
34941 cycles

Greats,

René

[0]:  
https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?h=wireguard-crypto-library-api-v3&id=62d2dc65ab455a95eb5deb8bdef1dd7bb4cc754d
[1]:  
https://github.com/vDorst/wireguard/commit/5498f0900829e01b571644ea1f799f48a31eb290
[2]:  
https://github.com/vDorst/wireguard/blob/45ede7c0cd675fd0de6b95af33eb3ac9746a8901/src/crypto/zinc/speedtest/poly1305.h
[3]: https://www.mediatek.com/products/homeNetworking/mt7621n-a
[4]:  
https://github.com/vDorst/wireguard/blob/fbb8035a46a84ac7c5ee53c875c1de6f202d0884/src/crypto/zinc/poly1305/poly1305-donna32.c#L40
Andy Polyakov Oct. 11, 2019, 2:14 p.m. UTC | #5
Hi,

On 10/8/19 1:38 PM, Andy Polyakov wrote:
>>> <snip>

>>

>> Hi Ard,

>>

>> Is it also an option to include my mip32r2 optimized poly1305 version?

>>

>> Below the results which shows a good improvement over the Andy Polyakov

>> version.

>> I swapped the poly1305 assembly file and rename the function to

>> <func_name>_mips

>> Full WireGuard source with the changes [0]

>>

>> bytes |  RvD | openssl | delta | delta / openssl

>>  ...

>>  4096 | 9160 | 11755   | -2595 | -22,08%


Update is pushed to cryptogams. Thanks to René for ideas, feedback and
testing! There is even a question about supporting DSP ASE, let's
discuss details off-list first.

As for multiply-by-1-n-add.

> I assume that the presented results depict regression after switch to

> cryptogams module. Right? RvD implementation distinguishes itself in two

> ways:

>

> 1. some of additions in inner loop are replaced with multiply-by-1-n-add;

> ...

>

> I recall attempting 1. and chosen not to do it with following rationale.

> On processor I have access to, Octeon II, it made no significant

> difference. It was better, but only marginally. And it's understandable,

> because Octeon II should have lesser difficulty pairing those additions

> with multiply-n-add instructions. But since multiplication is an

> expensive operation, it can be pretty slow, I reckoned that on processor

> less potent than Octeon II it might be more appropriate to minimize

> amount of multiplication-n-add instructions.


As an example, the MIPS 1004K manual discusses that there are two
options for multiplier for this core, proper and poor-man's. Proper
multiplier unit can issue multiplication or multiplication-n-add each
cycle, with multiplication latency apparently being 4. Poor-man's unit
on the other hand can issue multiplication each 32nd[!] cycle with
corresponding latency. This means that core with poor-man's unit would
perform ~13% worse than it could have been. Updated module does use
multiply-by-1-n-add, so this note is effectively for reference in case
"poor man" wonders.

Cheers.
René van Dorst Oct. 11, 2019, 5:21 p.m. UTC | #6
Hi Andy,

Quoting Andy Polyakov <appro@cryptogams.org>:

> Hi,

>

> On 10/8/19 1:38 PM, Andy Polyakov wrote:

>>>> <snip>

>>>

>>> Hi Ard,

>>>

>>> Is it also an option to include my mip32r2 optimized poly1305 version?

>>>

>>> Below the results which shows a good improvement over the Andy Polyakov

>>> version.

>>> I swapped the poly1305 assembly file and rename the function to

>>> <func_name>_mips

>>> Full WireGuard source with the changes [0]

>>>

>>> bytes |  RvD | openssl | delta | delta / openssl

>>>  ...

>>>  4096 | 9160 | 11755   | -2595 | -22,08%

>

> Update is pushed to cryptogams. Thanks to René for ideas, feedback and

> testing! There is even a question about supporting DSP ASE, let's

> discuss details off-list first.

>


Thanks!
I see that you have found another spot to save 1 cycle.

Last results: poly1305: 4096 bytes,     188.671 MB/sec,     9066 cycles

I also wonder if we can also replace the "li $x, -4" and "and $x" with  
"sll $x"
combination on other places like [0], also on line 1169?

Replace this on line 1169, works on my device.

-       li      $in0,-4
         srl     $ctx,$tmp4,2
-       and     $in0,$in0,$tmp4
         andi    $tmp4,$tmp4,3
+       sll     $in0, $ctx, 2
         addu    $ctx,$ctx,$in0

> As for multiply-by-1-n-add.

>

>> I assume that the presented results depict regression after switch to

>> cryptogams module. Right? RvD implementation distinguishes itself in two

>> ways:

>>

>> 1. some of additions in inner loop are replaced with multiply-by-1-n-add;

>> ...

>>

>> I recall attempting 1. and chosen not to do it with following rationale.

>> On processor I have access to, Octeon II, it made no significant

>> difference. It was better, but only marginally. And it's understandable,

>> because Octeon II should have lesser difficulty pairing those additions

>> with multiply-n-add instructions. But since multiplication is an

>> expensive operation, it can be pretty slow, I reckoned that on processor

>> less potent than Octeon II it might be more appropriate to minimize

>> amount of multiplication-n-add instructions.

>

> As an example, MIPS 1004K manual discusses that that there are two

> options for multiplier for this core, proper and poor-man's. Proper

> multiplier unit can issue multiplication or multiplication-n-add each

> cycle, with multiplication latency apparently being 4. Poor-man's unit

> on the other hand can issue multiplication each 32nd[!] cycle with

> corresponding latency. This means that core with poor-man's unit would

> perform ~13% worse than it could have been. Updated module does use

> multiply-by-1-n-add, so this note is effectively for reference in case

> "poor man" wonders.

>

> Cheers.


Thanks for this information.
I wonder how many devices do exist with the "poor man" version.

Greats,

René

[0]:  
https://github.com/dot-asm/cryptogams/blob/d22ade312a7af958ec955620b0d241cf42c37feb/mips/poly1305-mips.pl#L461
Andy Polyakov Oct. 11, 2019, 6:49 p.m. UTC | #7
Hi,

On 10/11/2019 7:21 PM, René van Dorst wrote:
>

> ...

>

> I also wonder if we can also replace the "li $x, -4" and "and $x" with

> "sll $x"

> combination on other places like [0], also on line 1169?

>

> Replace this on line 1169, works on my device.

>

> -       li      $in0,-4

>         srl     $ctx,$tmp4,2

> -       and     $in0,$in0,$tmp4

>         andi    $tmp4,$tmp4,3

> +       sll     $in0, $ctx, 2

>         addu    $ctx,$ctx,$in0


The reason for why I chose to keep 'li $in0,-4' in poly1305_emit is
because the original sequence has higher instruction-level parallelism.
Yes, it's one extra instruction, but if all of them get paired, they
will execute faster. Yes, it doesn't help single-issue processors such
as yours, but thing is that next instruction depends on last, and then
*formally* it's more appropriate to aim for higher ILP as general rule.
Just in case: in poly1305_blocks it is different, because the dependent
instruction does not immediately follow the one that computes the residue.
>> As for multiply-by-1-n-add.

>>

>

> I wonder how many devices do exist with the "poor man" version.


Well, it's not just how many devices, but more specifically how many of
those will end up running the code in question. I would guess poor-man's
unit would be found in ultra-low-power microcontroller, so... As
implied, it's probably sufficient to keep this in mind just in case :-)

Cheers.
Arnd Bergmann Oct. 11, 2019, 9:38 p.m. UTC | #8
On Fri, Oct 11, 2019 at 7:21 PM René van Dorst <opensource@vdorst.com> wrote:
> Quoting Andy Polyakov <appro@cryptogams.org>:

> > On 10/8/19 1:38 PM, Andy Polyakov wrote:

> > As an example, MIPS 1004K manual discusses that that there are two

> > options for multiplier for this core, proper and poor-man's. Proper

> > multiplier unit can issue multiplication or multiplication-n-add each

> > cycle, with multiplication latency apparently being 4. Poor-man's unit

> > on the other hand can issue multiplication each 32nd[!] cycle with

> > corresponding latency. This means that core with poor-man's unit would

> > perform ~13% worse than it could have been. Updated module does use

> > multiply-by-1-n-add, so this note is effectively for reference in case

> > "poor man" wonders.

>

> Thanks for this information.

> I wonder how many devices do exist with the "poor man" version.


I'm fairly sure the MT7621 is the only 1004k supported by the mainline
Linux kernel today, and likely the only one that will ever run this code.

Ralink/Mediatek, Lantiq/Intel and Ikanos/Qualcomm had some other
SoCs based on the related 34k core with an optional iterative multiplier,
out of those only Lantiq ARX100/VRX200 has support in Linux or
OpenWRT.

Everyone else (in the wireless and router space at least) seems to have
skipped the 34k/1004k and only used 24k or 74k/1074k based chips that
are the most common and have a fast multiplier, or some custom mips
core.

      Arnd
diff mbox series

Patch

diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile
index b528b9d300f1..8e1deaf00e0c 100644
--- a/arch/mips/crypto/Makefile
+++ b/arch/mips/crypto/Makefile
@@ -8,3 +8,17 @@  obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
 obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
 chacha-mips-y := chacha-core.o chacha-glue.o
 AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+
+# poly1305-mips.o is linked from the generated assembly core and the C glue.
+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
+poly1305-mips-y := poly1305-core.o poly1305-glue.o
+
+# Flavour argument handed to the perlasm script: o32 for MIPS32 CPUs, 64 for
+# MIPS64 CPUs.
+perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
+perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
+
+# Run the script as: perl <script> <flavour> <output file>
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
+
+# poly1305-core.S is generated from poly1305-mips.pl.
+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
+	$(call if_changed,perlasm)
+
+targets += poly1305-core.S
diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..7e1742cdc5a4
--- /dev/null
+++ b/arch/mips/crypto/poly1305-glue.c
@@ -0,0 +1,203 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+/*
+ * Entry points implemented in the assembly generated from poly1305-mips.pl,
+ * which renames poly1305_{init,blocks,emit} to the *_mips symbols below when
+ * __KERNEL__ is defined.
+ *
+ * poly1305_init_mips:   zeroes the accumulator in @state and, if @key is
+ *                       non-NULL, stores the clamped first 16 key bytes.
+ * poly1305_blocks_mips: absorbs @len / 16 whole blocks from @src; @hibit is
+ *                       the pad bit added above the block data.
+ * poly1305_emit_mips:   reduces the accumulator, adds the four 32-bit words
+ *                       at @nonce and stores the 16-byte result at @digest
+ *                       one byte at a time.
+ */
+asmlinkage void poly1305_init_mips(void *state, const u8 *key);
+asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
+
+/*
+ * Library entry point: start a Poly1305 computation from a 32-byte key.
+ * The first half of the key goes to the assembly init routine; the second
+ * half is kept in dctx->s as four little-endian words for the final step.
+ */
+void poly1305_init(struct poly1305_desc_ctx *dctx, const u8 *key)
+{
+	const u8 *s_key = key + 16;
+	unsigned int i;
+
+	poly1305_init_mips(&dctx->h, key);
+
+	for (i = 0; i < 4; i++)
+		dctx->s[i] = get_unaligned_le32(s_key + 4 * i);
+
+	/* no partial block is pending yet */
+	dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+/*
+ * shash .init hook: reset the per-request state. Neither half of the key is
+ * known at this point, so both key flags are cleared; mips_poly1305_blocks()
+ * picks the key up from the start of the data stream.
+ */
+static int mips_poly1305_init(struct shash_desc *desc)
+{
+	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->sset = false;
+	ctx->rset = 0;
+	ctx->buflen = 0;
+
+	return 0;
+}
+
+/*
+ * Feed @len bytes at @src to the assembly block function on behalf of the
+ * shash interface, which delivers the key in-band: the first block of the
+ * stream initialises the assembly state (dctx->rset) and the second block is
+ * stored in dctx->s (dctx->sset). Only what remains after that is hashed,
+ * rounded down to a whole number of blocks. Callers in this file always pass
+ * at least one full block.
+ */
+static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+				 u32 len, u32 hibit)
+{
+	u32 whole;
+
+	if (unlikely(!dctx->sset)) {
+		if (!dctx->rset) {
+			poly1305_init_mips(&dctx->h, src);
+			dctx->rset = 1;
+			src += POLY1305_BLOCK_SIZE;
+			len -= POLY1305_BLOCK_SIZE;
+		}
+
+		/* not enough input left for the second key half yet */
+		if (len < POLY1305_BLOCK_SIZE)
+			return;
+
+		dctx->s[0] = get_unaligned_le32(src +  0);
+		dctx->s[1] = get_unaligned_le32(src +  4);
+		dctx->s[2] = get_unaligned_le32(src +  8);
+		dctx->s[3] = get_unaligned_le32(src + 12);
+		dctx->sset = true;
+		src += POLY1305_BLOCK_SIZE;
+		len -= POLY1305_BLOCK_SIZE;
+
+		/* the key material used up every whole block we were given */
+		if (len < POLY1305_BLOCK_SIZE)
+			return;
+	}
+
+	/* drop any trailing partial block; the caller buffers it */
+	whole = len & ~(POLY1305_BLOCK_SIZE - 1);
+
+	poly1305_blocks_mips(&dctx->h, src, whole, hibit);
+}
+
+/*
+ * shash .update hook: absorb @len bytes from @src. Input is processed in
+ * whole blocks; anything shorter than a block is parked in dctx->buf until
+ * more data (or the final call) arrives. Always returns 0.
+ */
+static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
+				unsigned int len)
+{
+	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	/* Complete a previously buffered partial block first. */
+	if (unlikely(dctx->buflen)) {
+		u32 take = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+		memcpy(&dctx->buf[dctx->buflen], src, take);
+		dctx->buflen += take;
+		src += take;
+		len -= take;
+
+		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+			mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+			dctx->buflen = 0;
+		}
+	}
+
+	/* The helper consumes every whole block, key blocks included. */
+	if (likely(len >= POLY1305_BLOCK_SIZE)) {
+		unsigned int whole = round_down(len, POLY1305_BLOCK_SIZE);
+
+		mips_poly1305_blocks(dctx, src, len, 1);
+		src += whole;
+		len -= whole;
+	}
+
+	/* Keep the tail for the next call. */
+	if (unlikely(len)) {
+		memcpy(dctx->buf, src, len);
+		dctx->buflen = len;
+	}
+	return 0;
+}
+
+/*
+ * Library entry point: absorb @nbytes bytes from @src into a state that was
+ * set up with poly1305_init(). Whole blocks go straight to the assembly
+ * routine; a trailing partial block is kept in dctx->buf.
+ */
+void poly1305_update(struct poly1305_desc_ctx *dctx, const u8 *src,
+		     unsigned int nbytes)
+{
+	/* Complete a previously buffered partial block first. */
+	if (unlikely(dctx->buflen)) {
+		u32 take = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+		memcpy(&dctx->buf[dctx->buflen], src, take);
+		dctx->buflen += take;
+		src += take;
+		nbytes -= take;
+
+		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+			poly1305_blocks_mips(&dctx->h, dctx->buf,
+					     POLY1305_BLOCK_SIZE, 1);
+			dctx->buflen = 0;
+		}
+	}
+
+	/* Hand all whole blocks to the assembly routine in one call. */
+	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+		unsigned int whole = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+		poly1305_blocks_mips(&dctx->h, src, whole, 1);
+		src += whole;
+		nbytes -= whole;
+	}
+
+	/* Keep the tail for the next call. */
+	if (unlikely(nbytes)) {
+		memcpy(dctx->buf, src, nbytes);
+		dctx->buflen = nbytes;
+	}
+}
+EXPORT_SYMBOL(poly1305_update);
+
+/*
+ * Library entry point: finish the computation and write the 16-byte MAC to
+ * @dst, which may be unaligned.
+ *
+ * A pending partial block is terminated with a 0x01 byte, zero-padded to a
+ * full block and absorbed with hibit == 0. poly1305_emit_mips() then performs
+ * the final reduction, adds dctx->s and stores the result with byte stores in
+ * little-endian order, so it can write into @dst directly; no further
+ * addition or byte swapping is needed here. The whole context is cleared
+ * afterwards because it holds key material.
+ */
+void poly1305_final(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+	}
+
+	/* mac = (h + s) % (2^128), computed and stored by the assembly */
+	poly1305_emit_mips(&dctx->h, (__le32 *)dst, dctx->s);
+
+	*dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final);
+
+/*
+ * shash .final hook: produce the MAC via poly1305_final(). Fails with
+ * -ENOKEY when the stream ended before the second key half was seen.
+ */
+static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	if (likely(ctx->sset)) {
+		poly1305_final(ctx, dst);
+		return 0;
+	}
+
+	return -ENOKEY;
+}
+
+/*
+ * shash descriptor for the MIPS implementation: algorithm "poly1305",
+ * driver "poly1305-mips", priority 200. The per-request context is a
+ * struct poly1305_desc_ctx.
+ */
+static struct shash_alg mips_poly1305_alg = {
+	.init			= mips_poly1305_init,
+	.update			= mips_poly1305_update,
+	.final			= mips_poly1305_final,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.descsize		= sizeof(struct poly1305_desc_ctx),
+
+	.base.cra_name		= "poly1305",
+	.base.cra_driver_name	= "poly1305-mips",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+/* Module load: register the shash algorithm; returns the registration result. */
+static int __init mips_poly1305_mod_init(void)
+{
+	return crypto_register_shash(&mips_poly1305_alg);
+}
+
+/* Module unload: unregister the shash algorithm registered at init time. */
+static void __exit mips_poly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&mips_poly1305_alg);
+}
+
+module_init(mips_poly1305_mod_init);
+module_exit(mips_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-mips");
diff --git a/arch/mips/crypto/poly1305-mips.pl b/arch/mips/crypto/poly1305-mips.pl
new file mode 100644
index 000000000000..02928df00a88
--- /dev/null
+++ b/arch/mips/crypto/poly1305-mips.pl
@@ -0,0 +1,1246 @@ 
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
+# project.
+# ====================================================================
+
+# Poly1305 hash for MIPS.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+#		IALU/gcc
+# R1x000	5.64/+120%	(big-endian)
+# Octeon II	3.80/+280%	(little-endian)
+#
+# March 2019
+#
+# Add 32-bit code path.
+#
+#		IALU/gcc
+# R1x000	10.3/?		(big-endian)
+# Octeon II	4.64/+90%	(little-endian)
+#
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+#   excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# The first command-line argument selects the ABI flavour; "64" is used when
+# no argument is given.
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+# Register that receives the return value: $a0 for the NUBI flavours,
+# otherwise $t0 (register 2 in the register layout above).
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+
+if ($flavour =~ /64|n32/i) {{{
+######################################################################
+# 64-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+     defined(_MIPS_ARCH_MIPS64R6)) \\
+     && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
+# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
+#else
+# define dmultu(rs,rt)		dmultu	rs,rt
+# define mflo(rd,rs,rt)	mflo	rd
+# define mfhi(rd,rs,rt)	mfhi	rd
+#endif
+
+#ifdef	__KERNEL__
+# define poly1305_init   poly1305_init_mips
+# define poly1305_blocks poly1305_blocks_mips
+# define poly1305_emit   poly1305_emit_mips
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	poly1305_init
+.ent	poly1305_init
+poly1305_init:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	sd	$zero,0($ctx)
+	sd	$zero,8($ctx)
+	sd	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+	andi	$tmp0,$inp,7		# $inp % 8
+	dsubu	$inp,$inp,$tmp0		# align $inp
+	sll	$tmp0,$tmp0,3		# byte to bit offset
+	ld	$in0,0($inp)
+	ld	$in1,8($inp)
+	beqz	$tmp0,.Laligned_key
+	ld	$tmp2,16($inp)
+
+	subu	$tmp1,$zero,$tmp0
+# ifdef	MIPSEB
+	dsllv	$in0,$in0,$tmp0
+	dsrlv	$tmp3,$in1,$tmp1
+	dsllv	$in1,$in1,$tmp0
+	dsrlv	$tmp2,$tmp2,$tmp1
+# else
+	dsrlv	$in0,$in0,$tmp0
+	dsllv	$tmp3,$in1,$tmp1
+	dsrlv	$in1,$in1,$tmp0
+	dsllv	$tmp2,$tmp2,$tmp1
+# endif
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+.Laligned_key:
+#else
+	ldl	$in0,0+MSB($inp)
+	ldl	$in1,8+MSB($inp)
+	ldr	$in0,0+LSB($inp)
+	ldr	$in1,8+LSB($inp)
+#endif
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+	dsbh	$in0,$in0		# byte swap
+	 dsbh	$in1,$in1
+	dshd	$in0,$in0
+	 dshd	$in1,$in1
+# else
+	ori	$tmp0,$zero,0xFF
+	dsll	$tmp2,$tmp0,32
+	or	$tmp0,$tmp2		# 0x000000FF000000FF
+
+	and	$tmp1,$in0,$tmp0	# byte swap
+	 and	$tmp3,$in1,$tmp0
+	dsrl	$tmp2,$in0,24
+	 dsrl	$tmp4,$in1,24
+	dsll	$tmp1,24
+	 dsll	$tmp3,24
+	and	$tmp2,$tmp0
+	 and	$tmp4,$tmp0
+	dsll	$tmp0,8			# 0x0000FF000000FF00
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	and	$tmp2,$in0,$tmp0
+	 and	$tmp4,$in1,$tmp0
+	dsrl	$in0,8
+	 dsrl	$in1,8
+	dsll	$tmp2,8
+	 dsll	$tmp4,8
+	and	$in0,$tmp0
+	 and	$in1,$tmp0
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+	dsrl	$tmp1,$in0,32
+	 dsrl	$tmp3,$in1,32
+	dsll	$in0,32
+	 dsll	$in1,32
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+# endif
+#endif
+	li	$tmp0,1
+	dsll	$tmp0,32		# 0x0000000100000000
+	daddiu	$tmp0,-63		# 0x00000000ffffffc1
+	dsll	$tmp0,28		# 0x0ffffffc10000000
+	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
+
+	and	$in0,$tmp0
+	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
+	and	$in1,$tmp0
+
+	sd	$in0,24($ctx)
+	dsrl	$tmp0,$in1,2
+	sd	$in1,32($ctx)
+	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
+	sd	$tmp0,40($ctx)
+
+.Lno_key:
+	li	$v0,0			# return 0
+	jr	$ra
+.end	poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+my ($shr,$shl) = ($s6,$s7);		# used on R6
+
+# poly1305_blocks(ctx, inp, len, padbit) - public entry point. It converts
+# the byte count into a count of complete 16-byte blocks and returns straight
+# away when that count is zero; otherwise it branches to
+# poly1305_blocks_internal, whose prologue follows.
+#
+# The .frame/.mask directives are emitted per variant so that they describe
+# the frame actually allocated: MIPS64R6 builds reserve 8*8 bytes and also
+# save $s6/$s7 (registers 18 and 19, mask bits 0x000c0000), every other build
+# reserves 6*8 bytes.
+$code.=<<___;
+.align	5
+.globl	poly1305_blocks
+.ent	poly1305_blocks
+poly1305_blocks:
+	.set	noreorder
+	dsrl	$len,4			# number of complete blocks
+	bnez	$len,poly1305_blocks_internal
+	nop
+	jr	$ra
+	nop
+.end	poly1305_blocks
+
+.align	5
+.ent	poly1305_blocks_internal
+poly1305_blocks_internal:
+	.set	noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+	.frame	$sp,8*8,$ra
+	.mask	$SAVED_REGS_MASK|0x000c0000,-8
+	dsubu	$sp,8*8
+	sd	$s7,56($sp)
+	sd	$s6,48($sp)
+#else
+	.frame	$sp,6*8,$ra
+	.mask	$SAVED_REGS_MASK,-8
+	dsubu	$sp,6*8
+#endif
+	sd	$s5,40($sp)
+	sd	$s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	sd	$s3,24($sp)
+	sd	$s2,16($sp)
+	sd	$s1,8($sp)
+	sd	$s0,0($sp)
+___
+$code.=<<___;
+	.set	reorder
+
+	ld	$h0,0($ctx)		# load hash value
+	ld	$h1,8($ctx)
+	ld	$h2,16($ctx)
+
+	ld	$r0,24($ctx)		# load key
+	ld	$r1,32($ctx)
+	ld	$rs1,40($ctx)
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+	andi	$shr,$inp,7
+	dsubu	$inp,$inp,$shr		# align $inp
+	sll	$shr,$shr,3		# byte to bit offset
+	subu	$shl,$zero,$shr
+#endif
+
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+	ld	$in0,0($inp)		# load input
+	ld	$in1,8($inp)
+	beqz	$shr,.Laligned_inp
+	ld	$tmp2,16($inp)
+
+# ifdef	MIPSEB
+	dsllv	$in0,$in0,$shr
+	dsrlv	$tmp3,$in1,$shl
+	dsllv	$in1,$in1,$shr
+	dsrlv	$tmp2,$tmp2,$shl
+# else
+	dsrlv	$in0,$in0,$shr
+	dsllv	$tmp3,$in1,$shl
+	dsrlv	$in1,$in1,$shr
+	dsllv	$tmp2,$tmp2,$shl
+# endif
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+.Laligned_inp:
+#else
+	ldl	$in0,0+MSB($inp)	# load input
+	ldl	$in1,8+MSB($inp)
+	ldr	$in0,0+LSB($inp)
+	ldr	$in1,8+LSB($inp)
+#endif
+	daddiu	$len,-1
+	daddiu	$inp,16
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+	dsbh	$in0,$in0		# byte swap
+	 dsbh	$in1,$in1
+	dshd	$in0,$in0
+	 dshd	$in1,$in1
+# else
+	ori	$tmp0,$zero,0xFF
+	dsll	$tmp2,$tmp0,32
+	or	$tmp0,$tmp2		# 0x000000FF000000FF
+
+	and	$tmp1,$in0,$tmp0	# byte swap
+	 and	$tmp3,$in1,$tmp0
+	dsrl	$tmp2,$in0,24
+	 dsrl	$tmp4,$in1,24
+	dsll	$tmp1,24
+	 dsll	$tmp3,24
+	and	$tmp2,$tmp0
+	 and	$tmp4,$tmp0
+	dsll	$tmp0,8			# 0x0000FF000000FF00
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	and	$tmp2,$in0,$tmp0
+	 and	$tmp4,$in1,$tmp0
+	dsrl	$in0,8
+	 dsrl	$in1,8
+	dsll	$tmp2,8
+	 dsll	$tmp4,8
+	and	$in0,$tmp0
+	 and	$in1,$tmp0
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+	dsrl	$tmp1,$in0,32
+	 dsrl	$tmp3,$in1,32
+	dsll	$in0,32
+	 dsll	$in1,32
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+# endif
+#endif
+	daddu	$h0,$in0		# accumulate input
+	daddu	$h1,$in1
+	sltu	$tmp0,$h0,$in0
+	sltu	$tmp1,$h1,$in1
+	daddu	$h1,$tmp0
+
+	dmultu	($r0,$h0)		# h0*r0
+	 daddu	$h2,$padbit
+	 sltu	$tmp0,$h1,$tmp0
+	mflo	($d0,$r0,$h0)
+	mfhi	($d1,$r0,$h0)
+
+	dmultu	($rs1,$h1)		# h1*5*r1
+	 daddu	$tmp0,$tmp1
+	 daddu	$h2,$tmp0
+	mflo	($tmp0,$rs1,$h1)
+	mfhi	($tmp1,$rs1,$h1)
+
+	dmultu	($r1,$h0)		# h0*r1
+	 daddu	$d0,$tmp0
+	 daddu	$d1,$tmp1
+	mflo	($tmp2,$r1,$h0)
+	mfhi	($d2,$r1,$h0)
+	 sltu	$tmp0,$d0,$tmp0
+	 daddu	$d1,$tmp0
+
+	dmultu	($r0,$h1)		# h1*r0
+	 daddu	$d1,$tmp2
+	 sltu	$tmp2,$d1,$tmp2
+	mflo	($tmp0,$r0,$h1)
+	mfhi	($tmp1,$r0,$h1)
+	 daddu	$d2,$tmp2
+
+	dmultu	($rs1,$h2)		# h2*5*r1
+	 daddu	$d1,$tmp0
+	 daddu	$d2,$tmp1
+	mflo	($tmp2,$rs1,$h2)
+
+	dmultu	($r0,$h2)		# h2*r0
+	 sltu	$tmp0,$d1,$tmp0
+	 daddu	$d2,$tmp0
+	mflo	($tmp3,$r0,$h2)
+
+	daddu	$d1,$tmp2
+	daddu	$d2,$tmp3
+	sltu	$tmp2,$d1,$tmp2
+	daddu	$d2,$tmp2
+
+	li	$tmp0,-4		# final reduction
+	and	$tmp0,$d2
+	dsrl	$tmp1,$d2,2
+	andi	$h2,$d2,3
+	daddu	$tmp0,$tmp1
+	daddu	$h0,$d0,$tmp0
+	sltu	$tmp0,$h0,$tmp0
+	daddu	$h1,$d1,$tmp0
+	sltu	$tmp0,$h1,$tmp0
+	daddu	$h2,$h2,$tmp0
+
+	bnez	$len,.Loop
+
+	sd	$h0,0($ctx)		# store hash value
+	sd	$h1,8($ctx)
+	sd	$h2,16($ctx)
+
+	.set	noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+	ld	$s7,56($sp)
+	ld	$s6,48($sp)
+#endif
+	ld	$s5,40($sp)		# epilogue
+	ld	$s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
+	ld	$s3,24($sp)
+	ld	$s2,16($sp)
+	ld	$s1,8($sp)
+	ld	$s0,0($sp)
+___
+$code.=<<___;
+	jr	$ra
+#if defined(_MIPS_ARCH_MIPS64R6)
+	daddu	$sp,8*8
+#else
+	daddu	$sp,6*8
+#endif
+.end	poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+# poly1305_emit(ctx, mac, nonce) - 64-bit flavour.
+#
+# Loads the three accumulator limbs from ctx, computes h + 5 with carry
+# propagation and derives a mask from bits 2 and up of the resulting top
+# limb. The xor/and/xor sequence then keeps h + 5 where the mask bits are set
+# and h where they are clear. The nonce is read as four 32-bit words, paired
+# into two 64-bit values and added with carry to the low 128 bits; the sum is
+# written to mac least-significant byte first using byte stores only.
+$code.=<<___;
+.align	5
+.globl	poly1305_emit
+.ent	poly1305_emit
+poly1305_emit:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	ld	$tmp0,0($ctx)
+	ld	$tmp1,8($ctx)
+	ld	$tmp2,16($ctx)
+
+	daddiu	$in0,$tmp0,5		# compare to modulus
+	sltiu	$tmp3,$in0,5
+	daddu	$in1,$tmp1,$tmp3
+	sltu	$tmp3,$in1,$tmp3
+	daddu	$tmp2,$tmp2,$tmp3
+
+	dsrl	$tmp2,2			# see if it carried/borrowed
+	dsubu	$tmp2,$zero,$tmp2
+
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	and	$in0,$tmp2
+	and	$in1,$tmp2
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+
+	lwu	$tmp0,0($nonce)		# load nonce
+	lwu	$tmp1,4($nonce)
+	lwu	$tmp2,8($nonce)
+	lwu	$tmp3,12($nonce)
+	dsll	$tmp1,32
+	dsll	$tmp3,32
+	or	$tmp0,$tmp1
+	or	$tmp2,$tmp3
+
+	daddu	$in0,$tmp0		# accumulate nonce
+	daddu	$in1,$tmp2
+	sltu	$tmp0,$in0,$tmp0
+	daddu	$in1,$tmp0
+
+	dsrl	$tmp0,$in0,8		# write mac value
+	dsrl	$tmp1,$in0,16
+	dsrl	$tmp2,$in0,24
+	sb	$in0,0($mac)
+	dsrl	$tmp3,$in0,32
+	sb	$tmp0,1($mac)
+	dsrl	$tmp0,$in0,40
+	sb	$tmp1,2($mac)
+	dsrl	$tmp1,$in0,48
+	sb	$tmp2,3($mac)
+	dsrl	$tmp2,$in0,56
+	sb	$tmp3,4($mac)
+	dsrl	$tmp3,$in1,8
+	sb	$tmp0,5($mac)
+	dsrl	$tmp0,$in1,16
+	sb	$tmp1,6($mac)
+	dsrl	$tmp1,$in1,24
+	sb	$tmp2,7($mac)
+
+	sb	$in1,8($mac)
+	dsrl	$tmp2,$in1,32
+	sb	$tmp3,9($mac)
+	dsrl	$tmp3,$in1,40
+	sb	$tmp0,10($mac)
+	dsrl	$tmp0,$in1,48
+	sb	$tmp1,11($mac)
+	dsrl	$tmp1,$in1,56
+	sb	$tmp2,12($mac)
+	sb	$tmp3,13($mac)
+	sb	$tmp0,14($mac)
+	sb	$tmp1,15($mac)
+
+	jr	$ra
+.end	poly1305_emit
+.rdata
+.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
+     defined(_MIPS_ARCH_MIPS32R6)) \\
+     && !defined(_MIPS_ARCH_MIPS32R2)
+# define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+# define multu(rs,rt)
+# define mflo(rd,rs,rt)	mulu	rd,rs,rt
+# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
+#else
+# define multu(rs,rt)	multu	rs,rt
+# define mflo(rd,rs,rt)	mflo	rd
+# define mfhi(rd,rs,rt)	mfhi	rd
+#endif
+
+#ifdef	__KERNEL__
+# define poly1305_init   poly1305_init_mips
+# define poly1305_blocks poly1305_blocks_mips
+# define poly1305_emit   poly1305_emit_mips
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 3
+#else
+# define MSB 3
+# define LSB 0
+#endif
+
+.text
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	poly1305_init
+.ent	poly1305_init
+poly1305_init:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	sw	$zero,0($ctx)
+	sw	$zero,4($ctx)
+	sw	$zero,8($ctx)
+	sw	$zero,12($ctx)
+	sw	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+	andi	$tmp0,$inp,3		# $inp % 4
+	subu	$inp,$inp,$tmp0		# align $inp
+	sll	$tmp0,$tmp0,3		# byte to bit offset
+	lw	$in0,0($inp)
+	lw	$in1,4($inp)
+	lw	$in2,8($inp)
+	lw	$in3,12($inp)
+	beqz	$tmp0,.Laligned_key
+
+	lw	$tmp2,16($inp)
+	subu	$tmp1,$zero,$tmp0
+# ifdef	MIPSEB
+	sllv	$in0,$in0,$tmp0
+	srlv	$tmp3,$in1,$tmp1
+	sllv	$in1,$in1,$tmp0
+	or	$in0,$in0,$tmp3
+	srlv	$tmp3,$in2,$tmp1
+	sllv	$in2,$in2,$tmp0
+	or	$in1,$in1,$tmp3
+	srlv	$tmp3,$in3,$tmp1
+	sllv	$in3,$in3,$tmp0
+	or	$in2,$in2,$tmp3
+	srlv	$tmp2,$tmp2,$tmp1
+	or	$in3,$in3,$tmp2
+# else
+	srlv	$in0,$in0,$tmp0
+	sllv	$tmp3,$in1,$tmp1
+	srlv	$in1,$in1,$tmp0
+	or	$in0,$in0,$tmp3
+	sllv	$tmp3,$in2,$tmp1
+	srlv	$in2,$in2,$tmp0
+	or	$in1,$in1,$tmp3
+	sllv	$tmp3,$in3,$tmp1
+	srlv	$in3,$in3,$tmp0
+	or	$in2,$in2,$tmp3
+	sllv	$tmp2,$tmp2,$tmp1
+	or	$in3,$in3,$tmp2
+# endif
+.Laligned_key:
+#else
+	lwl	$in0,0+MSB($inp)
+	lwl	$in1,4+MSB($inp)
+	lwl	$in2,8+MSB($inp)
+	lwl	$in3,12+MSB($inp)
+	lwr	$in0,0+LSB($inp)
+	lwr	$in1,4+LSB($inp)
+	lwr	$in2,8+LSB($inp)
+	lwr	$in3,12+LSB($inp)
+#endif
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+	wsbh	$in0,$in0		# byte swap
+	wsbh	$in1,$in1
+	wsbh	$in2,$in2
+	wsbh	$in3,$in3
+	rotr	$in0,$in0,16
+	rotr	$in1,$in1,16
+	rotr	$in2,$in2,16
+	rotr	$in3,$in3,16
+# else
+	srl	$tmp0,$in0,24		# byte swap
+	srl	$tmp1,$in0,8
+	andi	$tmp2,$in0,0xFF00
+	sll	$in0,$in0,24
+	andi	$tmp1,0xFF00
+	sll	$tmp2,$tmp2,8
+	or	$in0,$tmp0
+	 srl	$tmp0,$in1,24
+	or	$tmp1,$tmp2
+	 srl	$tmp2,$in1,8
+	or	$in0,$tmp1
+	 andi	$tmp1,$in1,0xFF00
+	 sll	$in1,$in1,24
+	 andi	$tmp2,0xFF00
+	 sll	$tmp1,$tmp1,8
+	 or	$in1,$tmp0
+	srl	$tmp0,$in2,24
+	 or	$tmp2,$tmp1
+	srl	$tmp1,$in2,8
+	 or	$in1,$tmp2
+	andi	$tmp2,$in2,0xFF00
+	sll	$in2,$in2,24
+	andi	$tmp1,0xFF00
+	sll	$tmp2,$tmp2,8
+	or	$in2,$tmp0
+	 srl	$tmp0,$in3,24
+	or	$tmp1,$tmp2
+	 srl	$tmp2,$in3,8
+	or	$in2,$tmp1
+	 andi	$tmp1,$in3,0xFF00
+	 sll	$in3,$in3,24
+	 andi	$tmp2,0xFF00
+	 sll	$tmp1,$tmp1,8
+	 or	$in3,$tmp0
+	 or	$tmp2,$tmp1
+	 or	$in3,$tmp2
+# endif
+#endif
+	lui	$tmp0,0x0fff
+	ori	$tmp0,0xffff		# 0x0fffffff
+	and	$in0,$in0,$tmp0
+	subu	$tmp0,3			# 0x0ffffffc
+	and	$in1,$in1,$tmp0
+	and	$in2,$in2,$tmp0
+	and	$in3,$in3,$tmp0
+
+	sw	$in0,20($ctx)
+	sw	$in1,24($ctx)
+	sw	$in2,28($ctx)
+	sw	$in3,32($ctx)
+
+	srl	$tmp1,$in1,2
+	srl	$tmp2,$in2,2
+	srl	$tmp3,$in3,2
+	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
+	addu	$in2,$in2,$tmp2
+	addu	$in3,$in3,$tmp3
+	sw	$in1,36($ctx)
+	sw	$in2,40($ctx)
+	sw	$in3,44($ctx)
+.Lno_key:
+	li	$v0,0
+	jr	$ra
+.end	poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
+my ($d0,$d1,$d2,$d3) =
+   ($a4,$a5,$a6,$a7);
+my $shr = $t2;		# used on R6
+
+# Prologue of the 32-bit poly1305_blocks(ctx, inp, len, padbit): reserve a
+# 12-word frame and save $s4..$s11 in its upper eight slots. .frame states
+# the same 12*4 bytes that the subu allocates, which also makes the -4 offset
+# given to .mask point at the slot holding $s11.
+$code.=<<___;
+.globl	poly1305_blocks
+.align	5
+.ent	poly1305_blocks
+poly1305_blocks:
+	.frame	$sp,12*4,$ra
+	.mask	$SAVED_REGS_MASK,-4
+	.set	noreorder
+	subu	$sp, $sp,4*12
+	sw	$s11,4*11($sp)
+	sw	$s10,4*10($sp)
+	sw	$s9, 4*9($sp)
+	sw	$s8, 4*8($sp)
+	sw	$s7, 4*7($sp)
+	sw	$s6, 4*6($sp)
+	sw	$s5, 4*5($sp)
+	sw	$s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	sw	$s3, 4*3($sp)
+	sw	$s2, 4*2($sp)
+	sw	$s1, 4*1($sp)
+	sw	$s0, 4*0($sp)
+___
+$code.=<<___;
+	.set	reorder
+
+	srl	$len,4			# number of complete blocks
+	beqz	$len,.Labort
+
+	lw	$h0,0($ctx)		# load hash value
+	lw	$h1,4($ctx)
+	lw	$h2,8($ctx)
+	lw	$h3,12($ctx)
+	lw	$h4,16($ctx)
+
+	lw	$r0,20($ctx)		# load key
+	lw	$r1,24($ctx)
+	lw	$r2,28($ctx)
+	lw	$r3,32($ctx)
+	lw	$rs1,36($ctx)
+	lw	$rs2,40($ctx)
+	lw	$rs3,44($ctx)
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+	andi	$shr,$inp,3
+	subu	$inp,$inp,$shr		# align $inp
+	sll	$shr,$shr,3		# byte to bit offset
+#endif
+
+.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6)
+	lw	$d0,0($inp)		# load input
+	lw	$d1,4($inp)
+	lw	$d2,8($inp)
+	lw	$d3,12($inp)
+	beqz	$shr,.Laligned_inp
+
+	lw	$t0,16($inp)
+	subu	$t1,$zero,$shr
+# ifdef	MIPSEB
+	sllv	$d0,$d0,$shr
+	srlv	$at,$d1,$t1
+	sllv	$d1,$d1,$shr
+	or	$d0,$d0,$at
+	srlv	$at,$d2,$t1
+	sllv	$d2,$d2,$shr
+	or	$d1,$d1,$at
+	srlv	$at,$d3,$t1
+	sllv	$d3,$d3,$shr
+	or	$d2,$d2,$at
+	srlv	$t0,$t0,$t1
+	or	$d3,$d3,$t0
+# else
+	srlv	$d0,$d0,$shr
+	sllv	$at,$d1,$t1
+	srlv	$d1,$d1,$shr
+	or	$d0,$d0,$at
+	sllv	$at,$d2,$t1
+	srlv	$d2,$d2,$shr
+	or	$d1,$d1,$at
+	sllv	$at,$d3,$t1
+	srlv	$d3,$d3,$shr
+	or	$d2,$d2,$at
+	sllv	$t0,$t0,$t1
+	or	$d3,$d3,$t0
+# endif
+.Laligned_inp:
+#else
+	lwl	$d0,0+MSB($inp)		# load input
+	lwl	$d1,4+MSB($inp)
+	lwl	$d2,8+MSB($inp)
+	lwl	$d3,12+MSB($inp)
+	lwr	$d0,0+LSB($inp)
+	lwr	$d1,4+LSB($inp)
+	lwr	$d2,8+LSB($inp)
+	lwr	$d3,12+LSB($inp)
+#endif
+	addiu	$len,$len,-1
+	addiu	$inp,$inp,16
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+	wsbh	$d0,$d0			# byte swap
+	wsbh	$d1,$d1
+	wsbh	$d2,$d2
+	wsbh	$d3,$d3
+	rotr	$d0,$d0,16
+	rotr	$d1,$d1,16
+	rotr	$d2,$d2,16
+	rotr	$d3,$d3,16
+# else
+	srl	$at,$d0,24		# byte swap
+	srl	$t0,$d0,8
+	andi	$t1,$d0,0xFF00
+	sll	$d0,$d0,24
+	andi	$t0,0xFF00
+	sll	$t1,$t1,8
+	or	$d0,$at
+	 srl	$at,$d1,24
+	or	$t0,$t1
+	 srl	$t1,$d1,8
+	or	$d0,$t0
+	 andi	$t0,$d1,0xFF00
+	 sll	$d1,$d1,24
+	 andi	$t1,0xFF00
+	 sll	$t0,$t0,8
+	 or	$d1,$at
+	srl	$at,$d2,24
+	 or	$t1,$t0
+	srl	$t0,$d2,8
+	 or	$d1,$t1
+	andi	$t1,$d2,0xFF00
+	sll	$d2,$d2,24
+	andi	$t0,0xFF00
+	sll	$t1,$t1,8
+	or	$d2,$at
+	 srl	$at,$d3,24
+	or	$t0,$t1
+	 srl	$t1,$d3,8
+	or	$d2,$t0
+	 andi	$t0,$d3,0xFF00
+	 sll	$d3,$d3,24
+	 andi	$t1,0xFF00
+	 sll	$t0,$t0,8
+	 or	$d3,$at
+	 or	$t1,$t0
+	 or	$d3,$t1
+# endif
+#endif
+	addu	$h0,$h0,$d0		# accumulate input
+	sltu	$d0,$h0,$d0		# carry
+
+	addu	$h1,$h1,$d1
+	sltu	$d1,$h1,$d1
+	addu	$h1,$h1,$d0
+	sltu	$d0,$h1,$d0
+	addu	$d1,$d1,$d0		# carry
+
+	addu	$h2,$h2,$d2
+	sltu	$d2,$h2,$d2
+	addu	$h2,$h2,$d1
+	sltu	$d1,$h2,$d1
+	addu	$d2,$d2,$d1		# carry
+
+	addu	$h3,$h3,$d3
+	sltu	$d3,$h3,$d3
+	addu	$h3,$h3,$d2
+
+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
+	multu	$r0,$h0			# h0*r0
+	maddu	$rs3,$h1		# h1*s3
+	maddu	$rs2,$h2		# h2*s2
+	maddu	$rs1,$h3		# h3*s1
+	mflo	$d0
+	mfhi	$at
+
+	 sltu	$d2,$h3,$d2
+	 addu	$d3,$d3,$d2		# carry
+	 addu	$h4,$h4,$padbit
+	 addu	$h4,$h4,$d3
+
+	multu	$r1,$h0			# h0*r1
+	maddu	$r0,$h1			# h1*r0
+	maddu	$rs3,$h2		# h2*s3
+	maddu	$rs2,$h3		# h3*s2
+	maddu	$rs1,$h4		# h4*s1
+	mflo	$d1
+	mfhi	$d2
+
+	multu	$r2,$h0			# h0*r2
+	maddu	$r1,$h1			# h1*r1
+	maddu	$r0,$h2			# h2*r0
+	maddu	$rs3,$h3		# h3*s3
+	maddu	$rs2,$h4		# h4*s2
+	 addu	$d1,$d1,$at
+	 sltu	$at,$d1,$at
+	 addu	$at,$d2,$at
+	mflo	$d2
+	mfhi	$d3
+
+	multu	$r3,$h0			# h0*r3
+	maddu	$r2,$h1			# h1*r2
+	maddu	$r1,$h2			# h2*r1
+	maddu	$r0,$h3			# h3*r0
+	maddu	$rs3,$h4		# h4*s3
+	 addu	$d2,$d2,$at
+	 sltu	$at,$d2,$at
+	 addu	$d3,$d3,$at
+	mflo	$h3
+	mfhi	$at
+
+	multu	$r0,$h4			# h4*r0
+	 addu	$h3,$h3,$d3
+	 sltu	$d3,$h3,$d3
+	 addu	$at,$d3,$at
+	mflo	$h4
+
+	addu	$h4,$at,$h4
+#else
+	multu	($r0,$h0)		# h0*r0
+	mflo	($d0,$r0,$h0)
+	mfhi	($d1,$r0,$h0)
+
+	 sltu	$d2,$h3,$d2
+	 addu	$d3,$d3,$d2		# carry
+
+	multu	($rs3,$h1)		# h1*s3
+	mflo	($at,$rs3,$h1)
+	mfhi	($t0,$rs3,$h1)
+
+	 addu	$h4,$h4,$padbit
+	 addu	$h4,$h4,$d3
+
+	multu	($rs2,$h2)		# h2*s2
+	mflo	($a3,$rs2,$h2)
+	mfhi	($t1,$rs2,$h2)
+	 addu	$d0,$d0,$at
+	 addu	$d1,$d1,$t0
+	multu	($rs1,$h3)		# h3*s1
+	 sltu	$at,$d0,$at
+	 addu	$d1,$d1,$at
+
+	mflo	($at,$rs1,$h3)
+	mfhi	($t0,$rs1,$h3)
+	 addu	$d0,$d0,$a3
+	 addu	$d1,$d1,$t1
+	multu	($r1,$h0)		# h0*r1
+	 sltu	$a3,$d0,$a3
+	 addu	$d1,$d1,$a3
+
+
+	mflo	($a3,$r1,$h0)
+	mfhi	($d2,$r1,$h0)
+	 addu	$d0,$d0,$at
+	 addu	$d1,$d1,$t0
+	multu	($r0,$h1)		# h1*r0
+	 sltu	$at,$d0,$at
+	 addu	$d1,$d1,$at
+
+	mflo	($at,$r0,$h1)
+	mfhi	($t0,$r0,$h1)
+	 addu	$d1,$d1,$a3
+	 sltu	$a3,$d1,$a3
+	multu	($rs3,$h2)		# h2*s3
+	 addu	$d2,$d2,$a3
+
+	mflo	($a3,$rs3,$h2)
+	mfhi	($t1,$rs3,$h2)
+	 addu	$d1,$d1,$at
+	 addu	$d2,$d2,$t0
+	multu	($rs2,$h3)		# h3*s2
+	 sltu	$at,$d1,$at
+	 addu	$d2,$d2,$at
+
+	mflo	($at,$rs2,$h3)
+	mfhi	($t0,$rs2,$h3)
+	 addu	$d1,$d1,$a3
+	 addu	$d2,$d2,$t1
+	multu	($rs1,$h4)		# h4*s1
+	 sltu	$a3,$d1,$a3
+	 addu	$d2,$d2,$a3
+
+	mflo	($a3,$rs1,$h4)
+	 addu	$d1,$d1,$at
+	 addu	$d2,$d2,$t0
+	multu	($r2,$h0)		# h0*r2
+	 sltu	$at,$d1,$at
+	 addu	$d2,$d2,$at
+
+
+	mflo	($at,$r2,$h0)
+	mfhi	($d3,$r2,$h0)
+	 addu	$d1,$d1,$a3
+	 sltu	$a3,$d1,$a3
+	multu	($r1,$h1)		# h1*r1
+	 addu	$d2,$d2,$a3
+
+	mflo	($a3,$r1,$h1)
+	mfhi	($t1,$r1,$h1)
+	 addu	$d2,$d2,$at
+	 sltu	$at,$d2,$at
+	multu	($r0,$h2)		# h2*r0
+	 addu	$d3,$d3,$at
+
+	mflo	($at,$r0,$h2)
+	mfhi	($t0,$r0,$h2)
+	 addu	$d2,$d2,$a3
+	 addu	$d3,$d3,$t1
+	multu	($rs3,$h3)		# h3*s3
+	 sltu	$a3,$d2,$a3
+	 addu	$d3,$d3,$a3
+
+	mflo	($a3,$rs3,$h3)
+	mfhi	($t1,$rs3,$h3)
+	 addu	$d2,$d2,$at
+	 addu	$d3,$d3,$t0
+	multu	($rs2,$h4)		# h4*s2
+	 sltu	$at,$d2,$at
+	 addu	$d3,$d3,$at
+
+	mflo	($at,$rs2,$h4)
+	 addu	$d2,$d2,$a3
+	 addu	$d3,$d3,$t1
+	multu	($r3,$h0)		# h0*r3
+	 sltu	$a3,$d2,$a3
+	 addu	$d3,$d3,$a3
+
+
+	mflo	($a3,$r3,$h0)
+	mfhi	($t1,$r3,$h0)
+	 addu	$d2,$d2,$at
+	 sltu	$at,$d2,$at
+	multu	($r2,$h1)		# h1*r2
+	 addu	$d3,$d3,$at
+
+	mflo	($at,$r2,$h1)
+	mfhi	($t0,$r2,$h1)
+	 addu	$d3,$d3,$a3
+	 sltu	$a3,$d3,$a3
+	multu	($r0,$h3)		# h3*r0
+	 addu	$t1,$t1,$a3
+
+	mflo	($a3,$r0,$h3)
+	mfhi	($h3,$r0,$h3)
+	 addu	$d3,$d3,$at
+	 addu	$t1,$t1,$t0
+	multu	($r1,$h2)		# h2*r1
+	 sltu	$at,$d3,$at
+	 addu	$t1,$t1,$at
+
+	mflo	($at,$r1,$h2)
+	mfhi	($t0,$r1,$h2)
+	 addu	$d3,$d3,$a3
+	 addu	$t1,$t1,$h3
+	multu	($rs3,$h4)		# h4*s3
+	 sltu	$a3,$d3,$a3
+	 addu	$t1,$t1,$a3
+
+	mflo	($a3,$rs3,$h4)
+	 addu	$d3,$d3,$at
+	 addu	$t1,$t1,$t0
+	multu	($r0,$h4)		# h4*r0
+	 sltu	$at,$d3,$at
+	 addu	$t1,$t1,$at
+
+
+	mflo	($h4,$r0,$h4)
+	 addu	$h3,$d3,$a3
+	 sltu	$a3,$h3,$a3
+	 addu	$t1,$t1,$a3
+	addu	$h4,$t1,$h4
+
+	li	$padbit,1		# if we loop, padbit is 1
+#endif
+
+	li	$at,-4			# final reduction
+	srl	$h0,$h4,2
+	and	$at,$at,$h4
+	andi	$h4,$h4,3
+	addu	$h0,$h0,$at
+
+	addu	$h0,$h0,$d0
+	sltu	$at,$h0,$d0
+	addu	$h1,$d1,$at
+	sltu	$at,$h1,$at
+	addu	$h2,$d2,$at
+	sltu	$at,$h2,$at
+	addu	$h3,$h3,$at
+	sltu	$at,$h3,$at
+	addu	$h4,$h4,$at
+
+	bnez	$len,.Loop
+
+	sw	$h0,0($ctx)		# store hash value
+	sw	$h1,4($ctx)
+	sw	$h2,8($ctx)
+	sw	$h3,12($ctx)
+	sw	$h4,16($ctx)
+
+	.set	noreorder
+.Labort:
+	lw	$s11,4*11($sp)
+	lw	$s10,4*10($sp)
+	lw	$s9, 4*9($sp)
+	lw	$s8, 4*8($sp)
+	lw	$s7, 4*7($sp)
+	lw	$s6, 4*6($sp)
+	lw	$s5, 4*5($sp)
+	lw	$s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
+	lw	$s3, 4*3($sp)
+	lw	$s2, 4*2($sp)
+	lw	$s1, 4*1($sp)
+	lw	$s0, 4*0($sp)
+___
+$code.=<<___;
+	jr	$ra
+	addu	$sp,$sp,4*12
+.end	poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+# poly1305_emit(ctx, mac, nonce) - 32-bit flavour.
+#
+# Loads the five accumulator limbs from ctx ($tmp4, i.e. $a3, holds the top
+# one). Once they are loaded the ctx pointer is no longer needed, so $ctx is
+# reused as the carry register and then as the selection mask. The code
+# computes h + 5 with carry propagation, derives the mask from bits 2 and up
+# of the resulting top limb, and the xor/and/xor sequence keeps h + 5 where
+# the mask bits are set and h where they are clear. The four nonce words are
+# then added with carry and the 128-bit sum is written to mac
+# least-significant byte first using byte stores only.
+$code.=<<___;
+.align	5
+.globl	poly1305_emit
+.ent	poly1305_emit
+poly1305_emit:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	lw	$tmp0,0($ctx)
+	lw	$tmp1,4($ctx)
+	lw	$tmp2,8($ctx)
+	lw	$tmp3,12($ctx)
+	lw	$tmp4,16($ctx)
+
+	addiu	$in0,$tmp0,5		# compare to modulus
+	sltiu	$ctx,$in0,5
+	addu	$in1,$tmp1,$ctx
+	sltu	$ctx,$in1,$ctx
+	addu	$in2,$tmp2,$ctx
+	sltu	$ctx,$in2,$ctx
+	addu	$in3,$tmp3,$ctx
+	sltu	$ctx,$in3,$ctx
+	addu	$ctx,$tmp4
+
+	srl	$ctx,2			# see if it carried/borrowed
+	subu	$ctx,$zero,$ctx
+
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	xor	$in2,$tmp2
+	xor	$in3,$tmp3
+	and	$in0,$ctx
+	and	$in1,$ctx
+	and	$in2,$ctx
+	and	$in3,$ctx
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	xor	$in2,$tmp2
+	xor	$in3,$tmp3
+
+	lw	$tmp0,0($nonce)		# load nonce
+	lw	$tmp1,4($nonce)
+	lw	$tmp2,8($nonce)
+	lw	$tmp3,12($nonce)
+
+	addu	$in0,$tmp0		# accumulate nonce
+	sltu	$ctx,$in0,$tmp0
+
+	addu	$in1,$tmp1
+	sltu	$tmp1,$in1,$tmp1
+	addu	$in1,$ctx
+	sltu	$ctx,$in1,$ctx
+	addu	$ctx,$tmp1
+
+	addu	$in2,$tmp2
+	sltu	$tmp2,$in2,$tmp2
+	addu	$in2,$ctx
+	sltu	$ctx,$in2,$ctx
+	addu	$ctx,$tmp2
+
+	addu	$in3,$tmp3
+	addu	$in3,$ctx
+
+	srl	$tmp0,$in0,8		# write mac value
+	srl	$tmp1,$in0,16
+	srl	$tmp2,$in0,24
+	sb	$in0, 0($mac)
+	sb	$tmp0,1($mac)
+	srl	$tmp0,$in1,8
+	sb	$tmp1,2($mac)
+	srl	$tmp1,$in1,16
+	sb	$tmp2,3($mac)
+	srl	$tmp2,$in1,24
+	sb	$in1, 4($mac)
+	sb	$tmp0,5($mac)
+	srl	$tmp0,$in2,8
+	sb	$tmp1,6($mac)
+	srl	$tmp1,$in2,16
+	sb	$tmp2,7($mac)
+	srl	$tmp2,$in2,24
+	sb	$in2, 8($mac)
+	sb	$tmp0,9($mac)
+	srl	$tmp0,$in3,8
+	sb	$tmp1,10($mac)
+	srl	$tmp1,$in3,16
+	sb	$tmp2,11($mac)
+	srl	$tmp2,$in3,24
+	sb	$in3, 12($mac)
+	sb	$tmp0,13($mac)
+	sb	$tmp1,14($mac)
+	sb	$tmp2,15($mac)
+
+	jr	$ra
+.end	poly1305_emit
+.rdata
+.asciiz	"Poly1305 for MIPS, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+}
+}}}
+
+# The last command-line argument, if present, names the output file; without
+# it the generated code goes to the inherited STDOUT. Failures to open or to
+# flush the output abort the script so that the build stops instead of
+# carrying on with a missing or truncated file.
+$output = pop;
+if ($output) {
+	open(STDOUT, ">", $output) or die "can't open $output: $!";
+}
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/Kconfig b/crypto/Kconfig
index ddda2bcdf5b7..49ffa9babfc5 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -687,6 +687,7 @@  config CRYPTO_ARCH_HAVE_LIB_POLY1305
 
 config CRYPTO_LIB_POLY1305_RSIZE
 	int
+	default 2 if MIPS
 	default 4 if X86_64
 	default 9 if ARM || ARM64
 	default 1
@@ -722,6 +723,11 @@  config CRYPTO_POLY1305_X86_64
 	  in IETF protocols. This is the x86_64 assembler implementation using SIMD
 	  instructions.
 
+config CRYPTO_POLY1305_MIPS
+	tristate "Poly1305 authenticator algorithm (MIPS optimized)"
+	depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+	help
+	  Poly1305 authenticator algorithm. This is the MIPS implementation
+	  built from the OpenSSL/CRYPTOGAMS perlasm code, with 32-bit and
+	  64-bit code paths.
+
 config CRYPTO_MD4
 	tristate "MD4 digest algorithm"
 	select CRYPTO_HASH