
[edk2,v3,3/3] MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines

Message ID 1473230862-20689-4-git-send-email-ard.biesheuvel@linaro.org
State Superseded

Commit Message

Ard Biesheuvel Sept. 7, 2016, 6:47 a.m. UTC
This adds AARCH64 support to BaseMemoryLibOptDxe, based on the cortex-strings
library. All string routines are accelerated except ScanMem16, ScanMem32,
ScanMem64 and IsZeroBuffer, which can wait for another day, since very few
occurrences of them exist in the codebase.
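
For context, the InternalMem* functions added below are the workers behind the
public BaseMemoryLib entry points, which validate their arguments and then
delegate. A rough sketch of that wrapper pattern (illustrative only, not part
of this patch; the actual MdePkg wrapper source may differ in its checks):

  INTN
  EFIAPI
  CompareMem (
    IN CONST VOID  *DestinationBuffer,
    IN CONST VOID  *SourceBuffer,
    IN UINTN       Length
    )
  {
    if (Length == 0 || DestinationBuffer == SourceBuffer) {
      return 0;
    }
    ASSERT (DestinationBuffer != NULL);
    ASSERT (SourceBuffer != NULL);
    //
    // InternalMemCompareMem is the routine implemented in
    // AArch64/CompareMem.S by this patch.
    //
    return InternalMemCompareMem (DestinationBuffer, SourceBuffer, Length);
  }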

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S    | 142 ++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S       | 284 ++++++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S       | 161 +++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S        | 244 +++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf |   9 +-
 5 files changed, 839 insertions(+), 1 deletion(-)

-- 
2.7.4


Patch

diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
new file mode 100644
index 000000000000..a54de6948be1
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
@@ -0,0 +1,142 @@ 
+//
+// Copyright (c) 2013, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+//
+
+
+// Parameters and result.
+#define src1      x0
+#define src2      x1
+#define limit     x2
+#define result    x0
+
+// Internal variables.
+#define data1     x3
+#define data1w    w3
+#define data2     x4
+#define data2w    w4
+#define diff      x6
+#define endloop   x7
+#define tmp1      x8
+#define tmp2      x9
+#define pos       x11
+#define limit_wd  x12
+#define mask      x13
+
+    .p2align 6
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+    eor     tmp1, src1, src2
+    tst     tmp1, #7
+    b.ne    .Lmisaligned8
+    ands    tmp1, src1, #7
+    b.ne    .Lmutual_align
+    add     limit_wd, limit, #7
+    lsr     limit_wd, limit_wd, #3
+
+    // Start of performance-critical section  -- one 64B cache line.
+.Lloop_aligned:
+    ldr     data1, [src1], #8
+    ldr     data2, [src2], #8
+.Lstart_realigned:
+    subs    limit_wd, limit_wd, #1
+    eor     diff, data1, data2        // Non-zero if differences found.
+    csinv   endloop, diff, xzr, ne    // Last Dword or differences.
+    cbz     endloop, .Lloop_aligned
+    // End of performance-critical section  -- one 64B cache line.
+
+    // Not reached the limit, must have found a diff.
+    cbnz    limit_wd, .Lnot_limit
+
+    // Limit % 8 == 0 => all bytes significant.
+    ands    limit, limit, #7
+    b.eq    .Lnot_limit
+
+    lsl     limit, limit, #3              // Bytes -> bits.
+    mov     mask, #~0
+    lsl     mask, mask, limit
+    bic     data1, data1, mask
+    bic     data2, data2, mask
+
+    orr     diff, diff, mask
+
+.Lnot_limit:
+    rev     diff, diff
+    rev     data1, data1
+    rev     data2, data2
+
+    // The MS-non-zero bit of DIFF marks either the first bit
+    // that is different, or the end of the significant data.
+    // Shifting left now will bring the critical information into the
+    // top bits.
+    clz     pos, diff
+    lsl     data1, data1, pos
+    lsl     data2, data2, pos
+
+    // But we need to zero-extend (the bytes are unsigned) the values and
+    // then perform a signed subtraction to produce the result.
+    lsr     data1, data1, #56
+    sub     result, data1, data2, lsr #56
+    ret
+
+.Lmutual_align:
+    // Sources are mutually aligned, but are not currently at an
+    // alignment boundary.  Round down the addresses and then mask off
+    // the bytes that precede the start point.
+    bic     src1, src1, #7
+    bic     src2, src2, #7
+    add     limit, limit, tmp1          // Adjust the limit for the extra.
+    lsl     tmp1, tmp1, #3              // Bytes beyond alignment -> bits.
+    ldr     data1, [src1], #8
+    neg     tmp1, tmp1                  // Bits to alignment -64.
+    ldr     data2, [src2], #8
+    mov     tmp2, #~0
+
+    // Little-endian.  Early bytes are at LSB.
+    lsr     tmp2, tmp2, tmp1            // Shift (tmp1 & 63).
+    add     limit_wd, limit, #7
+    orr     data1, data1, tmp2
+    orr     data2, data2, tmp2
+    lsr     limit_wd, limit_wd, #3
+    b       .Lstart_realigned
+
+    .p2align 6
+.Lmisaligned8:
+    sub     limit, limit, #1
+1:
+    // Perhaps we can do better than this.
+    ldrb    data1w, [src1], #1
+    ldrb    data2w, [src2], #1
+    subs    limit, limit, #1
+    ccmp    data1w, data2w, #0, cs      // NZCV = 0b0000.
+    b.eq    1b
+    sub     result, data1, data2
+    ret
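
For illustration (not part of the patch): the aligned fast path above behaves
like the following C model on a little-endian target; the assembly resolves
the differing word with rev/clz bit tricks instead of the byte loop shown here.

  #include <stdint.h>
  #include <string.h>

  // Simplified model of InternalMemCompareMem: compare 8 bytes at a time,
  // then locate the first differing byte and return its signed difference.
  static int64_t
  CompareMemModel (const uint8_t *s1, const uint8_t *s2, uint64_t len)
  {
    while (len >= 8) {
      uint64_t a, b;
      memcpy (&a, s1, 8);       // the assembly uses 64-bit loads here
      memcpy (&b, s2, 8);
      if (a != b) {
        break;                  // the first difference is inside this word
      }
      s1 += 8; s2 += 8; len -= 8;
    }
    // Byte-wise tail: also resolves the differing word found above.
    for (uint64_t i = 0; i < len; i++) {
      if (s1[i] != s2[i]) {
        return (int64_t)s1[i] - (int64_t)s2[i];
      }
    }
    return 0;
  }
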
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
new file mode 100644
index 000000000000..10b55b065c47
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
@@ -0,0 +1,284 @@ 
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses.
+//
+//
+
+#define dstin     x0
+#define src       x1
+#define count     x2
+#define dst       x3
+#define srcend    x4
+#define dstend    x5
+#define A_l       x6
+#define A_lw      w6
+#define A_h       x7
+#define A_hw      w7
+#define B_l       x8
+#define B_lw      w8
+#define B_h       x9
+#define C_l       x10
+#define C_h       x11
+#define D_l       x12
+#define D_h       x13
+#define E_l       x14
+#define E_h       x15
+#define F_l       srcend
+#define F_h       dst
+#define tmp1      x9
+#define tmp2      x3
+
+#define L(l) .L ## l
+
+// Copies are split into 3 main cases: small copies of up to 16 bytes,
+// medium copies of 17..96 bytes, which are fully unrolled, and large
+// copies of more than 96 bytes, which align the destination and use an
+// unrolled loop processing 64 bytes per iteration.
+// Small and medium copies read all data before writing, allowing any
+// kind of overlap, and memmove tailcalls memcpy for these cases as
+// well as for non-overlapping copies.
+
+__memcpy:
+    prfm    PLDL1KEEP, [src]
+    add     srcend, src, count
+    add     dstend, dstin, count
+    cmp     count, 16
+    b.ls    L(copy16)
+    cmp     count, 96
+    b.hi    L(copy_long)
+
+    // Medium copies: 17..96 bytes.
+    sub     tmp1, count, 1
+    ldp     A_l, A_h, [src]
+    tbnz    tmp1, 6, L(copy96)
+    ldp     D_l, D_h, [srcend, -16]
+    tbz     tmp1, 5, 1f
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [srcend, -32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstend, -32]
+1:
+    stp     A_l, A_h, [dstin]
+    stp     D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    // Small copies: 0..16 bytes.
+L(copy16):
+    cmp     count, 8
+    b.lo    1f
+    ldr     A_l, [src]
+    ldr     A_h, [srcend, -8]
+    str     A_l, [dstin]
+    str     A_h, [dstend, -8]
+    ret
+    .p2align 4
+1:
+    tbz     count, 2, 1f
+    ldr     A_lw, [src]
+    ldr     A_hw, [srcend, -4]
+    str     A_lw, [dstin]
+    str     A_hw, [dstend, -4]
+    ret
+
+    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
+    // byte 3 times if count==1, or the 2nd byte twice if count==2.
+1:
+    cbz     count, 2f
+    lsr     tmp1, count, 1
+    ldrb    A_lw, [src]
+    ldrb    A_hw, [srcend, -1]
+    ldrb    B_lw, [src, tmp1]
+    strb    A_lw, [dstin]
+    strb    B_lw, [dstin, tmp1]
+    strb    A_hw, [dstend, -1]
+2:  ret
+
+    .p2align 4
+    // Copy 65..96 bytes.  Copy 64 bytes from the start and
+    // 32 bytes from the end.
+L(copy96):
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [src, 32]
+    ldp     D_l, D_h, [src, 48]
+    ldp     E_l, E_h, [srcend, -32]
+    ldp     F_l, F_h, [srcend, -16]
+    stp     A_l, A_h, [dstin]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin, 32]
+    stp     D_l, D_h, [dstin, 48]
+    stp     E_l, E_h, [dstend, -32]
+    stp     F_l, F_h, [dstend, -16]
+    ret
+
+    // Align DST to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align.	The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    .p2align 4
+L(copy_long):
+    and     tmp1, dstin, 15
+    bic     dst, dstin, 15
+    ldp     D_l, D_h, [src]
+    sub     src, src, tmp1
+    add     count, count, tmp1      // Count is now 16 too large.
+    ldp     A_l, A_h, [src, 16]
+    stp     D_l, D_h, [dstin]
+    ldp     B_l, B_h, [src, 32]
+    ldp     C_l, C_h, [src, 48]
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 128 + 16  // Test and readjust count.
+    b.ls    2f
+1:
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [src, 16]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [src, 32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [src, 48]
+    stp     D_l, D_h, [dst, 64]!
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes.	 The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the end even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [srcend, -64]
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [srcend, -48]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [srcend, -16]
+    stp     D_l, D_h, [dst, 64]
+    stp     E_l, E_h, [dstend, -64]
+    stp     A_l, A_h, [dstend, -48]
+    stp     B_l, B_h, [dstend, -32]
+    stp     C_l, C_h, [dstend, -16]
+    ret
+
+
+//
+// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+// Larger backwards copies are also handled by memcpy. The only remaining
+// case is forward large copies.  The destination is aligned, and an
+// unrolled loop processes 64 bytes per iteration.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+    sub     tmp2, dstin, src
+    cmp     count, 96
+    ccmp    tmp2, count, 2, hi
+    b.hs    __memcpy
+
+    cbz     tmp2, 3f
+    add     dstend, dstin, count
+    add     srcend, src, count
+
+    // Align dstend to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    and     tmp2, dstend, 15
+    ldp     D_l, D_h, [srcend, -16]
+    sub     srcend, srcend, tmp2
+    sub     count, count, tmp2
+    ldp     A_l, A_h, [srcend, -16]
+    stp     D_l, D_h, [dstend, -16]
+    ldp     B_l, B_h, [srcend, -32]
+    ldp     C_l, C_h, [srcend, -48]
+    ldp     D_l, D_h, [srcend, -64]!
+    sub     dstend, dstend, tmp2
+    subs    count, count, 128
+    b.ls    2f
+    nop
+1:
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [srcend, -16]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [srcend, -48]
+    stp     D_l, D_h, [dstend, -64]!
+    ldp     D_l, D_h, [srcend, -64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the start even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [src, 48]
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [src, 32]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [src, 16]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [src]
+    stp     D_l, D_h, [dstend, -64]
+    stp     E_l, E_h, [dstin, 48]
+    stp     A_l, A_h, [dstin, 32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin]
+3:  ret
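
For illustration (not part of the patch): the "branchless sequence" used for
the 0..3 byte tail of __memcpy reads all source bytes before writing, so it is
overlap-safe; in C it looks roughly like this.

  #include <stddef.h>
  #include <stdint.h>

  // C model of the 0..3-byte tail copy: the same byte may be stored more
  // than once (three times for count==1), which is harmless.
  static void
  Copy0To3 (uint8_t *dst, const uint8_t *src, size_t count)
  {
    if (count == 0) {
      return;                        // mirrors the cbz count, 2f
    }
    size_t  half  = count >> 1;      // 0 for count==1, 1 for count==2 or 3
    uint8_t first = src[0];
    uint8_t mid   = src[half];
    uint8_t last  = src[count - 1];
    dst[0]         = first;
    dst[half]      = mid;
    dst[count - 1] = last;
  }

The overlap check at the top of InternalMemCopyMem works the same way in
unsigned arithmetic: when dstin - src >= count, the destination does not start
inside [src, src + count), so the forward __memcpy path is safe; otherwise the
routine copies backwards from the end.
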
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 000000000000..e9029546d762
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@ 
+//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the company nor the names of its contributors
+//       may be used to endorse or promote products derived from this
+//       software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin     x0
+#define cntin     x1
+#define chrin     w2
+
+#define result    x0
+
+#define src       x3
+#define	tmp       x4
+#define wtmp2     w5
+#define synd      x6
+#define soff      x9
+#define cntrem    x10
+
+#define vrepchr   v0
+#define vdata1    v1
+#define vdata2    v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask  v5
+#define vend      v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (this is faster than using a
+// 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
+// in which things occur in the original buffer, counting trailing zeros
+// identifies exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+    // Do not dereference srcin if no bytes to compare.
+    cbz	cntin, .Lzero_length
+    //
+    // Magic constant 0x40100401 allows us to identify which lane matches
+    // the requested byte.
+    //
+    mov     wtmp2, #0x0401
+    movk    wtmp2, #0x4010, lsl #16
+    dup     vrepchr.16b, chrin
+    // Work with aligned 32-byte chunks
+    bic     src, srcin, #31
+    dup     vrepmask.4s, wtmp2
+    ands    soff, srcin, #31
+    and     cntrem, cntin, #31
+    b.eq    .Lloop
+
+    //
+    // The input buffer is not 32-byte aligned. We calculate the syndrome
+    // value for the aligned 32-byte block containing the first bytes,
+    // then mask off the irrelevant part.
+    //
+
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    sub     tmp, soff, #32
+    adds    cntin, cntin, tmp
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128
+    addp    vend.16b, vend.16b, vend.16b                  // 128->64
+    mov     synd, vend.2d[0]
+    // Clear the soff*2 lower bits
+    lsl     tmp, soff, #1
+    lsr     synd, synd, tmp
+    lsl     synd, synd, tmp
+    // The first block can also be the last
+    b.ls    .Lmasklast
+    // Have we found something already?
+    cbnz    synd, .Ltail
+
+.Lloop:
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    subs    cntin, cntin, #32
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    // If we're out of data we finish regardless of the result
+    b.ls    .Lend
+    // Use a fast check for the termination condition
+    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
+    addp    vend.2d, vend.2d, vend.2d
+    mov     synd, vend.2d[0]
+    // We're not out of data, loop if we haven't found the character
+    cbz     synd, .Lloop
+
+.Lend:
+    // Termination condition found, let's calculate the syndrome value
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
+    addp    vend.16b, vend.16b, vend.16b                // 128->64
+    mov     synd, vend.2d[0]
+    // Only do the clear for the last possible block
+    b.hi    .Ltail
+
+.Lmasklast:
+    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+    add     tmp, cntrem, soff
+    and     tmp, tmp, #31
+    sub     tmp, tmp, #32
+    neg     tmp, tmp, lsl #1
+    lsl     synd, synd, tmp
+    lsr     synd, synd, tmp
+
+.Ltail:
+    // Count the trailing zeros using bit reversing
+    rbit    synd, synd
+    // Compensate the last post-increment
+    sub     src, src, #32
+    // Check that we have found a character
+    cmp     synd, #0
+    // And count the leading zeros
+    clz     synd, synd
+    // Compute the potential result
+    add     result, src, synd, lsr #1
+    // Select result or NULL
+    csel    result, xzr, result, eq
+    ret
+
+.Lzero_length:
+    mov   result, #0
+    ret
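
For illustration (not part of the patch): the two-bits-per-byte syndrome
described in the header comment can be modeled in scalar C as below. The real
code builds it with NEON cmeq/and/addp and recovers the index with rbit/clz,
but the encoding and the "count trailing zeros, then halve" lookup are the
same.

  #include <stdint.h>

  // Scalar model of the 64-bit syndrome for one 32-byte chunk: bit 0 of
  // each 2-bit lane is set when the corresponding byte matches the target.
  static uint64_t
  Syndrome32 (const uint8_t *chunk, uint8_t target)
  {
    uint64_t syndrome = 0;
    for (unsigned i = 0; i < 32; i++) {
      if (chunk[i] == target) {
        syndrome |= 1ULL << (2 * i);
      }
    }
    return syndrome;
  }

  // Index of the first match in the chunk, or -1 if there is none.
  static int
  FirstMatch (uint64_t syndrome)
  {
    if (syndrome == 0) {
      return -1;
    }
    return (int)(__builtin_ctzll (syndrome) >> 1);
  }
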
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 000000000000..7f361110d4fe
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,244 @@ 
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses
+//
+//
+
+#define dstin     x0
+#define count     x1
+#define val       x2
+#define valw      w2
+#define dst       x3
+#define dstend    x4
+#define tmp1      x5
+#define tmp1w     w5
+#define tmp2      x6
+#define tmp2w     w6
+#define zva_len   x7
+#define zva_lenw  w7
+
+#define L(l) .L ## l
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+    dup     v0.8H, valw
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+    dup     v0.4S, valw
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+    dup     v0.2D, val
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+    movi    v0.16B, #0
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+    dup     v0.16B, valw
+0:  add     dstend, dstin, count
+    mov     val, v0.D[0]
+
+    cmp     count, 96
+    b.hi    L(set_long)
+    cmp     count, 16
+    b.hs    L(set_medium)
+
+    // Set 0..15 bytes.
+    tbz     count, 3, 1f
+    str     val, [dstin]
+    str     val, [dstend, -8]
+    ret
+    nop
+1:  tbz     count, 2, 2f
+    str     valw, [dstin]
+    str     valw, [dstend, -4]
+    ret
+2:  cbz     count, 3f
+    strb    valw, [dstin]
+    tbz     count, 1, 3f
+    strh    valw, [dstend, -2]
+3:  ret
+
+    // Set 16..96 bytes.
+L(set_medium):
+    str     q0, [dstin]
+    tbnz    count, 6, L(set96)
+    str     q0, [dstend, -16]
+    tbz     count, 5, 1f
+    str     q0, [dstin, 16]
+    str     q0, [dstend, -32]
+1:  ret
+
+    .p2align 4
+    // Set 64..96 bytes.  Write 64 bytes from the start and
+    // 32 bytes from the end.
+L(set96):
+    str     q0, [dstin, 16]
+    stp     q0, q0, [dstin, 32]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+    nop
+L(set_long):
+    bic     dst, dstin, 15
+    str     q0, [dstin]
+    cmp     count, 256
+    ccmp    val, 0, 0, cs
+    b.eq    L(try_zva)
+L(no_zva):
+    sub     count, dstend, dst        // Count is 16 too large.
+    add     dst, dst, 16
+    sub     count, count, 64 + 16     // Adjust count and bias for loop.
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+L(tail64):
+    subs    count, count, 64
+    b.hi    1b
+2:  stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(try_zva):
+    mrs     tmp1, dczid_el0
+    tbnz    tmp1w, 4, L(no_zva)
+    and     tmp1w, tmp1w, 15
+    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
+    b.ne    L(zva_128)
+
+    // Write the first and last 64 byte aligned block using stp rather
+    // than using DC ZVA.  This is faster on some cores.
+L(zva_64):
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    bic     dst, dst, 63
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    sub     count, dstend, dst         // Count is now 128 too large.
+    sub     count, count, 128+64+64    // Adjust count and bias for loop.
+    add     dst, dst, 128
+    nop
+1:  dc      zva, dst
+    add     dst, dst, 64
+    subs    count, count, 64
+    b.hi    1b
+    stp     q0, q0, [dst, 0]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(zva_128):
+    cmp     tmp1w, 5                    // ZVA size is 128 bytes.
+    b.ne    L(zva_other)
+
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    bic     dst, dst, 127
+    sub     count, dstend, dst          // Count is now 128 too large.
+    sub     count, count, 128+128       // Adjust count and bias for loop.
+    add     dst, dst, 128
+1:  dc      zva, dst
+    add     dst, dst, 128
+    subs    count, count, 128
+    b.hi    1b
+    stp     q0, q0, [dstend, -128]
+    stp     q0, q0, [dstend, -96]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+L(zva_other):
+    mov     tmp2w, 4
+    lsl     zva_lenw, tmp2w, tmp1w
+    add     tmp1, zva_len, 64           // Max alignment bytes written.
+    cmp     count, tmp1
+    blo     L(no_zva)
+
+    sub     tmp2, zva_len, 1
+    add     tmp1, dst, zva_len
+    add     dst, dst, 16
+    subs    count, tmp1, dst            // Actual alignment bytes to write.
+    bic     tmp1, tmp1, tmp2            // Aligned dc zva start address.
+    beq     2f
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+    subs    count, count, 64
+    b.hi    1b
+2:  mov     dst, tmp1
+    sub     count, dstend, tmp1         // Remaining bytes to write.
+    subs    count, count, zva_len
+    b.lo    4f
+3:  dc      zva, dst
+    add     dst, dst, zva_len
+    subs    count, count, zva_len
+    b.hs    3b
+4:  add     count, count, zva_len
+    b       L(tail64)
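
For illustration (not part of the patch): the L(try_zva) / L(zva_other) paths
key off DCZID_EL0. A C sketch of the same decoding (AArch64 only, GCC/Clang
inline assembly):

  #include <stdint.h>

  // Mirrors the mrs/tbnz/and sequence in L(try_zva): returns the DC ZVA
  // block size in bytes, or 0 when DC ZVA is prohibited (DZP bit set).
  static unsigned
  DcZvaBlockSize (void)
  {
    uint64_t dczid;
    __asm__ volatile ("mrs %0, dczid_el0" : "=r" (dczid));
    if (dczid & (1U << 4)) {
      return 0;                      // DZP: DC ZVA must not be used
    }
    // BS (bits [3:0]) is log2 of the block size in words, so the size in
    // bytes is 4 << BS: BS == 4 -> 64 bytes, BS == 5 -> 128 bytes.
    return 4U << (dczid & 0xF);
  }
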
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index d95eb599ea9e..64d11b09ef06 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@  [Defines]
 
 
 #
-#  VALID_ARCHITECTURES           = IA32 X64 ARM
+#  VALID_ARCHITECTURES           = IA32 X64 ARM AARCH64
 #
 
 [Sources]
@@ -127,6 +127,13 @@  [Sources.ARM]
   Arm/CopyMem.asm     |RVCT
   Arm/CompareMem.asm  |RVCT
 
+[Sources.AARCH64]
+  AArch64/ScanMem.S
+  AArch64/SetMem.S
+  AArch64/CopyMem.S
+  AArch64/CompareMem.S
+
+[Sources.ARM, Sources.AARCH64]
   Arm/ScanMemGeneric.c
 
 [Sources]