
[edk2,v2,2/3] MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines

Message ID 1473171813-24595-3-git-send-email-ard.biesheuvel@linaro.org
State Superseded

Commit Message

Ard Biesheuvel Sept. 6, 2016, 2:23 p.m. UTC
This adds ARM support to BaseMemoryLibOptDxe, partially based on the
cortex-strings library (ScanMem) and the existing CopyMem() implementation
from BaseMemoryLibStm in ArmPkg.

All string routines are accelerated except ScanMem16, ScanMem32,
ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few
occurrences of these exist in the codebase.)
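
As an aside, the word-at-a-time scanning trick at the heart of the
accelerated ScanMem8 can be modelled in C roughly as follows. This is
an illustrative sketch only, not part of the patch; the real routine
below is hand-written Thumb-2 that uses uadd8/sel rather than the
classic SWAR subtraction test, but the idea is the same:

  #include <stdint.h>
  #include <stddef.h>

  static const void *ScanMem8Model (const void *Buf, size_t Len, uint8_t Val)
  {
    const uint8_t *p = Buf;
    uint32_t      pattern = Val * 0x01010101u; // replicate byte to all lanes

    // Head: byte loop until the pointer is word aligned (the real code
    // aligns to 8 bytes).
    while (Len != 0 && ((uintptr_t)p & 3) != 0) {
      if (*p == Val) return p;
      p++, Len--;
    }
    // Body: one XOR per word; a zero byte in x marks a match.
    while (Len >= 4) {
      uint32_t x = *(const uint32_t *)p ^ pattern;
      if (((x - 0x01010101u) & ~x & 0x80808080u) != 0) break;
      p += 4, Len -= 4;
    }
    // Tail, and pinpointing the match within the final word.
    for (; Len != 0; p++, Len--) {
      if (*p == Val) return p;
    }
    return NULL;
  }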

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

---
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S        | 138 ++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm      | 140 ++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S           | 172 ++++++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm         | 147 +++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S           | 146 +++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm         | 147 +++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c    | 142 ++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S            |  69 ++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm          |  74 +++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf |  30 ++--
 10 files changed, 1191 insertions(+), 14 deletions(-)

-- 
2.7.4


Patch

diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
new file mode 100644
index 000000000000..951d15777a38
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
@@ -0,0 +1,138 @@ 
+//
+// Copyright (c) 2013 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Parameters and result.
+#define src1      r0
+#define src2      r1
+#define limit     r2
+#define result    r0
+
+// Internal variables.
+#define data1     r3
+#define data2     r4
+#define limit_wd  r5
+#define diff      r6
+#define tmp1      r7
+#define tmp2      r12
+#define pos       r8
+#define mask      r14
+
+    .text
+    .thumb
+    .syntax unified
+    .align  5
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+    push    {r4-r8, lr}
+    eor     tmp1, src1, src2
+    tst     tmp1, #3
+    bne     .Lmisaligned4
+    ands    tmp1, src1, #3
+    bne     .Lmutual_align
+    add     limit_wd, limit, #3
+    nop.w
+    lsr     limit_wd, limit_wd, #2
+
+    // Start of performance-critical section  -- one 32B cache line.
+.Lloop_aligned:
+    ldr     data1, [src1], #4
+    ldr     data2, [src2], #4
+.Lstart_realigned:
+    subs    limit_wd, limit_wd, #1
+    eor     diff, data1, data2        // Non-zero if differences found.
+    cbnz    diff, 0f
+    bne     .Lloop_aligned
+    // End of performance-critical section  -- one 32B cache line.
+
+    // Not reached the limit, must have found a diff.
+0:  cbnz    limit_wd, .Lnot_limit
+
+    // Limit % 4 == 0 => all bytes significant.
+    ands    limit, limit, #3
+    beq     .Lnot_limit
+
+    lsl     limit, limit, #3              // Bytes -> bits.
+    mov     mask, #~0
+    lsl     mask, mask, limit
+    bic     data1, data1, mask
+    bic     data2, data2, mask
+
+    orr     diff, diff, mask
+
+.Lnot_limit:
+    rev     diff, diff
+    rev     data1, data1
+    rev     data2, data2
+
+    // The MS-non-zero bit of DIFF marks either the first bit
+    // that is different, or the end of the significant data.
+    // Shifting left now will bring the critical information into the
+    // top bits.
+    clz     pos, diff
+    lsl     data1, data1, pos
+    lsl     data2, data2, pos
+
+    // But we need to zero-extend (char is unsigned) the value and then
+    // perform a signed 32-bit subtraction.
+    lsr     data1, data1, #28
+    sub     result, data1, data2, lsr #28
+    pop     {r4-r8, pc}
+
+.Lmutual_align:
+    // Sources are mutually aligned, but are not currently at an
+    // alignment boundary.  Round down the addresses and then mask off
+    // the bytes that precede the start point.
+    bic     src1, src1, #3
+    bic     src2, src2, #3
+    add     limit, limit, tmp1          // Adjust the limit for the extra.
+    lsl     tmp1, tmp1, #2              // Bytes beyond alignment -> bits.
+    ldr     data1, [src1], #4
+    neg     tmp1, tmp1                  // Bits to alignment -32.
+    ldr     data2, [src2], #4
+    mov     tmp2, #~0
+
+    // Little-endian.  Early bytes are at LSB.
+    lsr     tmp2, tmp2, tmp1            // Shift (tmp1 & 31).
+    add     limit_wd, limit, #3
+    orr     data1, data1, tmp2
+    orr     data2, data2, tmp2
+    lsr     limit_wd, limit_wd, #2
+    b       .Lstart_realigned
+
+.Lmisaligned4:
+    sub     limit, limit, #1
+1:
+    // Perhaps we can do better than this.
+    ldrb    data1, [src1], #1
+    ldrb    data2, [src2], #1
+    subs    limit, limit, #1
+    it      cs
+    cmpcs   data1, data2
+    beq     1b
+    sub     result, data1, data2
+    pop     {r4-r8, pc}
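
(Aside, not part of the patch.) The rev/clz sequence above pinpoints the
first differing byte: XOR the two words, byte-reverse so the
lowest-addressed byte becomes most significant, and let the leading-zero
count line both words up on the first difference. A hypothetical C model
using GCC builtins:

  #include <stdint.h>

  // Given two words known to differ, return <0, 0 or >0 according to
  // the first differing byte, as memcmp()/CompareMem() would.
  static int CompareWordsModel (uint32_t data1, uint32_t data2)
  {
    uint32_t diff = data1 ^ data2;       // non-zero where bytes differ

    diff  = __builtin_bswap32 (diff);    // rev: first byte -> MSB
    data1 = __builtin_bswap32 (data1);
    data2 = __builtin_bswap32 (data2);

    int pos = __builtin_clz (diff);      // clz: first differing bit
    data1 <<= pos;                       // shift it up to bit 31
    data2 <<= pos;

    // The top bits now decide the (unsigned) byte order.
    return (int)(data1 >> 28) - (int)(data2 >> 28);
  }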
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
new file mode 100644
index 000000000000..47b49ee16473
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
@@ -0,0 +1,140 @@ 
+;
+; Copyright (c) 2013 - 2016, Linaro Limited
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;     * Redistributions of source code must retain the above copyright
+;       notice, this list of conditions and the following disclaimer.
+;     * Redistributions in binary form must reproduce the above copyright
+;       notice, this list of conditions and the following disclaimer in the
+;       documentation and/or other materials provided with the distribution.
+;     * Neither the name of the Linaro nor the
+;       names of its contributors may be used to endorse or promote products
+;       derived from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Parameters and result.
+#define src1      r0
+#define src2      r1
+#define limit     r2
+#define result    r0
+
+; Internal variables.
+#define data1     r3
+#define data2     r4
+#define limit_wd  r5
+#define diff      r6
+#define tmp1      r7
+#define tmp2      r12
+#define pos       r8
+#define mask      r14
+
+    EXPORT  InternalMemCompareMem
+    THUMB
+    AREA    CompareMem, CODE, READONLY
+
+InternalMemCompareMem
+    push    {r4-r8, lr}
+    eor     tmp1, src1, src2
+    tst     tmp1, #3
+    bne     Lmisaligned4
+    ands    tmp1, src1, #3
+    bne     Lmutual_align
+    add     limit_wd, limit, #3
+    nop.w
+    lsr     limit_wd, limit_wd, #2
+
+    ; Start of performance-critical section  -- one 32B cache line.
+Lloop_aligned
+    ldr     data1, [src1], #4
+    ldr     data2, [src2], #4
+Lstart_realigned
+    subs    limit_wd, limit_wd, #1
+    eor     diff, data1, data2        ; Non-zero if differences found.
+    cbnz    diff, L0
+    bne     Lloop_aligned
+    ; End of performance-critical section  -- one 32B cache line.
+
+    ; Not reached the limit, must have found a diff.
+L0
+    cbnz    limit_wd, Lnot_limit
+
+    ; Limit % 4 == 0 => all bytes significant.
+    ands    limit, limit, #3
+    beq     Lnot_limit
+
+    lsl     limit, limit, #3              ; Bytes -> bits.
+    mov     mask, #~0
+    lsl     mask, mask, limit
+    bic     data1, data1, mask
+    bic     data2, data2, mask
+
+    orr     diff, diff, mask
+
+Lnot_limit
+    rev     diff, diff
+    rev     data1, data1
+    rev     data2, data2
+
+    ; The MS-non-zero bit of DIFF marks either the first bit
+    ; that is different, or the end of the significant data.
+    ; Shifting left now will bring the critical information into the
+    ; top bits.
+    clz     pos, diff
+    lsl     data1, data1, pos
+    lsl     data2, data2, pos
+
+    ; But we need to zero-extend (char is unsigned) the value and then
+    ; perform a signed 32-bit subtraction.
+    lsr     data1, data1, #28
+    sub     result, data1, data2, lsr #28
+    pop     {r4-r8, pc}
+
+Lmutual_align
+    ; Sources are mutually aligned, but are not currently at an
+    ; alignment boundary.  Round down the addresses and then mask off
+    ; the bytes that precede the start point.
+    bic     src1, src1, #3
+    bic     src2, src2, #3
+    add     limit, limit, tmp1          ; Adjust the limit for the extra.
+    lsl     tmp1, tmp1, #2              ; Bytes beyond alignment -> bits.
+    ldr     data1, [src1], #4
+    neg     tmp1, tmp1                  ; Bits to alignment -32.
+    ldr     data2, [src2], #4
+    mov     tmp2, #~0
+
+    ; Little-endian.  Early bytes are at LSB.
+    lsr     tmp2, tmp2, tmp1            ; Shift (tmp1 & 31).
+    add     limit_wd, limit, #3
+    orr     data1, data1, tmp2
+    orr     data2, data2, tmp2
+    lsr     limit_wd, limit_wd, #2
+    b       Lstart_realigned
+
+Lmisaligned4
+    sub     limit, limit, #1
+L1
+    ; Perhaps we can do better than this.
+    ldrb    data1, [src1], #1
+    ldrb    data2, [src2], #1
+    subs    limit, limit, #1
+    it      cs
+    cmpcs   data1, data2
+    beq     L1
+    sub     result, data1, data2
+    pop     {r4-r8, pc}
+
+    END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
new file mode 100644
index 000000000000..fb5293befc10
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
@@ -0,0 +1,172 @@ 
+#------------------------------------------------------------------------------
+#
+# CopyMem() worker for ARM
+#
+# This file started out as C code that did 64-bit moves if the buffer was
+# 32-bit aligned, and byte copies otherwise (including any trailing bytes).
+# It was updated to do 32-byte copies using stm/ldm.
+#
+# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+# This program and the accompanying materials
+# are licensed and made available under the terms and conditions of the BSD License
+# which accompanies this distribution.  The full text of the license may be found at
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+    .text
+    .thumb
+    .syntax unified
+
+/**
+  Copy Length bytes from Source to Destination. Overlap is OK.
+
+  This implementation copies 32 bytes at a time using ldm/stm when length and alignment allow, and byte by byte otherwise.
+
+  @param  Destination Target of copy
+  @param  Source      Place to copy from
+  @param  Length      Number of bytes to copy
+
+  @return Destination
+
+
+VOID *
+EFIAPI
+InternalMemCopyMem (
+  OUT     VOID                      *DestinationBuffer,
+  IN      CONST VOID                *SourceBuffer,
+  IN      UINTN                     Length
+  )
+**/
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+    push    {r4-r11, lr}
+    // Save the input parameters in extra registers (r11 = destination, r10 = destination (copy), r14 = source, r12 = length)
+    mov     r11, r0
+    mov     r10, r0
+    mov     r12, r2
+    mov     r14, r1
+
+    cmp     r11, r1
+    // If (dest < source)
+    bcc     memcopy_check_optim_default
+
+    // If (source + length < dest)
+    rsb     r3, r1, r11
+    cmp     r12, r3
+    bcc     memcopy_check_optim_default
+    b       memcopy_check_optim_overlap
+
+memcopy_check_optim_default:
+    // Check if we can use an optimized path ((length >= 32) && destination 16-byte aligned && source 16-byte aligned) for the memcopy (optimized path if r0 == 1)
+    tst     r0, #0xF
+    it      ne
+    movne   r0, #0
+    bne     memcopy_default
+    tst     r1, #0xF
+    ite     ne
+    movne   r3, #0
+    moveq   r3, #1
+    cmp     r2, #31
+    ite     ls
+    movls   r0, #0
+    andhi   r0, r3, #1
+    b       memcopy_default
+
+memcopy_check_optim_overlap:
+    // r10 = dest_end, r14 = source_end
+    add     r10, r11, r12
+    add     r14, r12, r1
+
+    // Are we in the optimized case ((length >= 32) && dest_end 16-byte aligned && source_end 16-byte aligned)?
+    cmp     r2, #31
+    ite     ls
+    movls   r0, #0
+    movhi   r0, #1
+    tst     r10, #0xF
+    it      ne
+    movne   r0, #0
+    tst     r14, #0xF
+    it      ne
+    movne   r0, #0
+    b       memcopy_overlapped
+
+memcopy_overlapped_non_optim:
+    // We read 1 byte from the end of the source buffer
+    sub     r3, r14, #1
+    sub     r12, r12, #1
+    ldrb    r3, [r3, #0]
+    sub     r2, r10, #1
+    cmp     r12, #0
+    // We write 1 byte at the end of the dest buffer
+    sub     r10, r10, #1
+    sub     r14, r14, #1
+    strb    r3, [r2, #0]
+    bne     memcopy_overlapped_non_optim
+    b       memcopy_end
+
+// r10 = dest_end, r14 = source_end
+memcopy_overlapped:
+    // Are we in the optimized case?
+    cmp     r0, #0
+    beq     memcopy_overlapped_non_optim
+
+    // Optimized Overlapped - Read 32 bytes
+    sub     r14, r14, #32
+    sub     r12, r12, #32
+    cmp     r12, #31
+    ldmia   r14, {r2-r9}
+
+    // If length is less than 32 then disable optim
+    it      ls
+    movls   r0, #0
+
+    cmp     r12, #0
+
+    // Optimized Overlapped - Write 32 bytes
+    sub     r10, r10, #32
+    stmia   r10, {r2-r9}
+
+    // while (length != 0)
+    bne     memcopy_overlapped
+    b       memcopy_end
+
+memcopy_default_non_optim:
+    // Byte copy
+    ldrb    r3, [r14], #1
+    sub     r12, r12, #1
+    strb    r3, [r10], #1
+
+memcopy_default:
+    cmp     r12, #0
+    beq     memcopy_end
+
+// r10 = dest, r14 = source
+memcopy_default_loop:
+    cmp     r0, #0
+    beq     memcopy_default_non_optim
+
+    // Optimized memcopy - Read 32 Bytes
+    sub     r12, r12, #32
+    cmp     r12, #31
+    ldmia   r14!, {r2-r9}
+
+    // If length is less than 32 then disable optim
+    it      ls
+    movls   r0, #0
+
+    cmp     r12, #0
+
+    // Optimized memcopy - Write 32 Bytes
+    stmia   r10!, {r2-r9}
+
+    // while (length != 0)
+    bne     memcopy_default_loop
+
+memcopy_end:
+    mov     r0, r11
+    pop     {r4-r11, pc}
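
(Aside, not part of the patch.) Stripped of the 32-byte ldm/stm
optimization, the control flow above reduces to the usual memmove()
shape: copy backwards when the destination overlaps the tail of the
source, forwards otherwise. A hypothetical C model:

  #include <stdint.h>
  #include <stddef.h>

  static void *CopyMemModel (void *Dst, const void *Src, size_t Len)
  {
    uint8_t       *d = Dst;
    const uint8_t *s = Src;

    if (d > s && (size_t)(d - s) < Len) {
      // A forward copy would clobber not-yet-read source bytes: go backwards.
      while (Len-- != 0) {
        d[Len] = s[Len];
      }
    } else {
      while (Len-- != 0) {
        *d++ = *s++;
      }
    }
    return Dst;
  }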
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
new file mode 100644
index 000000000000..2034807954d7
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
@@ -0,0 +1,147 @@ 
+;------------------------------------------------------------------------------
+;
+; CopyMem() worker for ARM
+;
+; This file started out as C code that did 64-bit moves if the buffer was
+; 32-bit aligned, and byte copies otherwise (including any trailing bytes).
+; It was updated to do 32-byte copies using stm/ldm.
+;
+; Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+; This program and the accompanying materials
+; are licensed and made available under the terms and conditions of the BSD License
+; which accompanies this distribution.  The full text of the license may be found at
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+    EXPORT  InternalMemCopyMem
+    AREA    CopyMem, CODE, READONLY
+    THUMB
+
+InternalMemCopyMem
+  stmfd  sp!, {r4-r11, lr}
+  ; Save the input parameters in extra registers (r11 = destination, r10 = destination (copy), r14 = source, r12 = length)
+  mov  r11, r0
+  mov  r10, r0
+  mov  r12, r2
+  mov  r14, r1
+
+memcopy_check_overlapped
+  cmp  r11, r1
+  ; If (dest < source)
+  bcc  memcopy_check_optim_default
+
+  ; If (source + length < dest)
+  rsb  r3, r1, r11
+  cmp  r12, r3
+  bcc  memcopy_check_optim_default
+  b     memcopy_check_optim_overlap
+
+memcopy_check_optim_default
+  ; Check if we can use an optimized path ((length >= 32) && destination 16-byte aligned && source 16-byte aligned) for the memcopy (optimized path if r0 == 1)
+  tst  r0, #0xF
+  movne  r0, #0
+  bne   memcopy_default
+  tst  r1, #0xF
+  movne  r3, #0
+  moveq  r3, #1
+  cmp  r2, #31
+  movls  r0, #0
+  andhi  r0, r3, #1
+  b     memcopy_default
+
+memcopy_check_optim_overlap
+  ; r10 = dest_end, r14 = source_end
+  add  r10, r11, r12
+  add  r14, r12, r1
+
+  ; Are we in the optimized case ((length >= 32) && dest_end 16-byte aligned && source_end 16-byte aligned)?
+  cmp  r2, #31
+  movls  r0, #0
+  movhi  r0, #1
+  tst  r10, #0xF
+  movne  r0, #0
+  tst  r14, #0xF
+  movne  r0, #0
+  b  memcopy_overlapped
+
+memcopy_overlapped_non_optim
+  ; We read 1 byte from the end of the source buffer
+  sub  r3, r14, #1
+  sub  r12, r12, #1
+  ldrb  r3, [r3, #0]
+  sub  r2, r10, #1
+  cmp  r12, #0
+  ; We write 1 byte at the end of the dest buffer
+  sub  r10, r10, #1
+  sub  r14, r14, #1
+  strb  r3, [r2, #0]
+  bne  memcopy_overlapped_non_optim
+  b   memcopy_end
+
+; r10 = dest_end, r14 = source_end
+memcopy_overlapped
+  ; Are we in the optimized case?
+  cmp  r0, #0
+  beq  memcopy_overlapped_non_optim
+
+  ; Optimized Overlapped - Read 32 bytes
+  sub  r14, r14, #32
+  sub  r12, r12, #32
+  cmp  r12, #31
+  ldmia  r14, {r2-r9}
+
+  ; If length is less than 32 then disable optim
+  movls  r0, #0
+
+  cmp  r12, #0
+
+  ; Optimized Overlapped - Write 32 bytes
+  sub  r10, r10, #32
+  stmia  r10, {r2-r9}
+
+  ; while (length != 0)
+  bne  memcopy_overlapped
+  b   memcopy_end
+
+memcopy_default_non_optim
+  ; Byte copy
+  ldrb  r3, [r14], #1
+  sub  r12, r12, #1
+  strb  r3, [r10], #1
+
+memcopy_default
+  cmp  r12, #0
+  beq  memcopy_end
+
+; r10 = dest, r14 = source
+memcopy_default_loop
+  cmp  r0, #0
+  beq  memcopy_default_non_optim
+
+  ; Optimized memcopy - Read 32 Bytes
+  sub  r12, r12, #32
+  cmp  r12, #31
+  ldmia  r14!, {r2-r9}
+
+  ; If length is less than 32 then disable optim
+  movls  r0, #0
+
+  cmp  r12, #0
+
+  ; Optimized memcopy - Write 32 Bytes
+  stmia  r10!, {r2-r9}
+
+  ; while (length != 0)
+  bne  memcopy_default_loop
+
+memcopy_end
+  mov  r0, r11
+  ldmfd  sp!, {r4-r11, pc}
+
+  END
+
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
new file mode 100644
index 000000000000..c4174e90949b
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
@@ -0,0 +1,146 @@ 
+// Copyright (c) 2010-2011, Linaro Limited
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 
+//    * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 
+//    * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 
+//    * Neither the name of Linaro Limited nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Written by Dave Gilbert <david.gilbert@linaro.org>
+//
+// This memchr routine is optimised on a Cortex-A9 and should work on
+// all ARMv7 processors.  It has a fast path for short sizes, and an
+// optimised path for large data sets; the worst case is finding the
+// match early in a large data set.
+//
+
+
+// 2011-02-07 david.gilbert@linaro.org
+//    Extracted from local git a5b438d861
+// 2011-07-14 david.gilbert@linaro.org
+//    Import endianness fix from local git ea786f1b
+// 2011-12-07 david.gilbert@linaro.org
+//    Removed unneeded cbz from align loop
+
+// this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+    .text
+    .thumb
+    .syntax unified
+
+    .type ASM_PFX(InternalMemScanMem8), %function
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+    // r0 = start of memory to scan
+    // r1 = length
+    // r2 = character to look for
+    // returns r0 = pointer to character or NULL if not found
+    uxtb    r2, r2        // Don't think we can trust the caller to actually pass a char
+
+    cmp     r1, #16       // If it's short don't bother with anything clever
+    blt     20f 
+
+    tst     r0, #7        // If it's already aligned skip the next bit
+    beq     10f
+
+    // Work up to an aligned point
+5:
+    ldrb    r3, [r0],#1
+    subs    r1, r1, #1
+    cmp     r3, r2
+    beq     50f           // If it matches exit found
+    tst     r0, #7
+    bne     5b            // If not aligned yet then do next byte
+
+10:
+    // At this point we are aligned and know we have at least 8 bytes to work with
+    push    {r4-r7}
+    orr     r2, r2, r2, lsl #8  // expand the match word across to all bytes
+    orr     r2, r2, r2, lsl #16
+    bic     r4, r1, #7    // Number of double words to work with
+    mvns    r7, #0        // all F's
+    movs    r3, #0
+
+15:
+    ldmia   r0!, {r5,r6}
+    subs    r4, r4, #8
+    eor     r5, r5, r2    // Get it so that r5,r6 have 00's where the bytes match the target
+    eor     r6, r6, r2
+    uadd8   r5, r5, r7    // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+    sel     r5, r3, r7    // bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+    uadd8   r6, r6, r7    // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+    sel     r6, r5, r7    // chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+    cbnz    r6, 60f
+    bne     15b           // (Flags from the subs above) If not run out of bytes then go around again
+
+    pop     {r4-r7}
+    and     r2, r2, #0xff // Get r2 back to a single character from the expansion above
+    and     r1, r1, #7    // Leave the count remaining as the number after the double words have been done
+ 
+20:
+    cbz     r1, 40f       // 0 length or hit the end already then not found
+
+21: // Post aligned section, or just a short call
+    ldrb    r3, [r0], #1
+    subs    r1, r1, #1
+    eor     r3, r3, r2    // r3 = 0 if match - doesn't break flags from sub
+    cbz     r3, 50f
+    bne     21b           // on r1 flags
+
+40:
+    movs    r0, #0        // not found
+    bx      lr
+
+50:
+    subs    r0, r0, #1    // found
+    bx      lr
+
+60: // We're here because the fast path found a hit - now we have to track down exactly which word it was
+    // r0 points to the start of the double word after the one that was tested
+    // r5 has the 00/ff pattern for the first word, r6 has the chained value
+    cmp     r5, #0
+    itte    eq
+    moveq   r5, r6        // the end is in the 2nd word
+    subeq   r0, r0, #3    // Points to 2nd byte of 2nd word
+    subne   r0, r0, #7    // or 2nd byte of 1st word
+
+    // r0 currently points to the 3rd byte of the word containing the hit
+    tst     r5, #CHARTSTMASK(0)     // 1st character
+    bne     61f
+    adds    r0, r0, #1
+    tst     r5, #CHARTSTMASK(1)     // 2nd character
+    ittt    eq
+    addeq   r0, r0 ,#1
+    tsteq   r5, #(3 << 15)          // 2nd & 3rd character
+    // If not the 3rd must be the last one
+    addeq   r0, r0, #1
+
+61:
+    pop     {r4-r7}
+    subs    r0, r0, #1
+    bx      lr
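
(Aside, not part of the patch.) The fixup at label 60 locates the first
0xff byte in the little-endian 00/ff match mask produced by uadd8/sel.
In C the same computation is simply (hypothetical model):

  #include <stdint.h>

  // mask holds 0xff in every byte position where the target byte matched.
  static unsigned FirstMatchIndexModel (uint32_t mask)
  {
    unsigned i;
    for (i = 0; i < 4; i++) {        // little-endian: byte 0 is bits 0..7
      if ((mask >> (8 * i)) & 0xff) {
        break;
      }
    }
    return i;                        // index of the first matching byte
  }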
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
new file mode 100644
index 000000000000..462817ba3310
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
@@ -0,0 +1,147 @@ 
+; Copyright (c) 2010-2011, Linaro Limited
+; All rights reserved.
+; 
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; 
+;    * Redistributions of source code must retain the above copyright
+;    notice, this list of conditions and the following disclaimer.
+; 
+;    * Redistributions in binary form must reproduce the above copyright
+;    notice, this list of conditions and the following disclaimer in the
+;    documentation and/or other materials provided with the distribution.
+; 
+;    * Neither the name of Linaro Limited nor the names of its
+;    contributors may be used to endorse or promote products derived
+;    from this software without specific prior written permission.
+; 
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+;
+; Written by Dave Gilbert <david.gilbert@linaro.org>
+;
+; This memchr routine is optimised on a Cortex-A9 and should work on
+; all ARMv7 processors.  It has a fast path for short sizes, and an
+; optimised path for large data sets; the worst case is finding the
+; match early in a large data set.
+;
+
+
+; 2011-02-07 david.gilbert@linaro.org
+;    Extracted from local git a5b438d861
+; 2011-07-14 david.gilbert@linaro.org
+;    Import endianness fix from local git ea786f1b
+; 2011-12-07 david.gilbert@linaro.org
+;    Removed unneeded cbz from align loop
+
+; this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+    EXPORT  InternalMemScanMem8
+    AREA    ScanMem, CODE, READONLY
+    THUMB
+
+InternalMemScanMem8
+    ; r0 = start of memory to scan
+    ; r1 = length
+    ; r2 = character to look for
+    ; returns r0 = pointer to character or NULL if not found
+    and     r2, r2, #0xff ; Don't think we can trust the caller to actually pass a char
+
+    cmp     r1, #16       ; If it's short don't bother with anything clever
+    blt     L20
+
+    tst     r0, #7        ; If it's already aligned skip the next bit
+    beq     L10
+
+    ; Work up to an aligned point
+L5
+    ldrb    r3, [r0],#1
+    subs    r1, r1, #1
+    cmp     r3, r2
+    beq     L50           ; If it matches exit found
+    tst     r0, #7
+    bne     L5            ; If not aligned yet then do next byte
+
+L10
+    ; At this point we are aligned and know we have at least 8 bytes to work with
+    push    {r4-r7}
+    orr     r2, r2, r2, lsl #8  ; expand the match word across to all bytes
+    orr     r2, r2, r2, lsl #16
+    bic     r4, r1, #7    ; Number of double words to work with
+    mvns    r7, #0        ; all F's
+    movs    r3, #0
+
+L15
+    ldmia   r0!, {r5,r6}
+    subs    r4, r4, #8
+    eor     r5, r5, r2    ; Get it so that r5,r6 have 00's where the bytes match the target
+    eor     r6, r6, r2
+    uadd8   r5, r5, r7    ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+    sel     r5, r3, r7    ; bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+    uadd8   r6, r6, r7    ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+    sel     r6, r5, r7    ; chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+    cbnz    r6, L60
+    bne     L15           ; (Flags from the subs above) If not run out of bytes then go around again
+
+    pop     {r4-r7}
+    and     r2, r2, #0xff ; Get r2 back to a single character from the expansion above
+    and     r1, r1, #7    ; Leave the count remaining as the number after the double words have been done
+ 
+L20
+    cbz     r1, L40       ; 0 length or hit the end already then not found
+
+L21 ; Post aligned section, or just a short call
+    ldrb    r3, [r0], #1
+    subs    r1, r1, #1
+    eor     r3, r3, r2    ; r3 = 0 if match - doesn't break flags from sub
+    cbz     r3, L50
+    bne     L21           ; on r1 flags
+
+L40
+    movs    r0, #0        ; not found
+    bx      lr
+
+L50
+    subs    r0, r0, #1    ; found
+    bx      lr
+
+L60 ; We're here because the fast path found a hit - now we have to track down exactly which word it was
+    ; r0 points to the start of the double word after the one that was tested
+    ; r5 has the 00/ff pattern for the first word, r6 has the chained value
+    cmp     r5, #0
+    itte    eq
+    moveq   r5, r6        ; the end is in the 2nd word
+    subeq   r0, r0, #3    ; Points to 2nd byte of 2nd word
+    subne   r0, r0, #7    ; or 2nd byte of 1st word
+
+    ; r0 currently points to the 3rd byte of the word containing the hit
+    tst     r5, #CHARTSTMASK(0)     ; 1st character
+    bne     L61
+    adds    r0, r0, #1
+    tst     r5, #CHARTSTMASK(1)     ; 2nd character
+    ittt    eq
+    addeq   r0, r0 ,#1
+    tsteq   r5, #(3 << 15)          ; 2nd & 3rd character
+    ; If not the 3rd must be the last one
+    addeq   r0, r0, #1
+
+L61
+    pop     {r4-r7}
+    subs    r0, r0, #1
+    bx      lr
+
+    END
+
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
new file mode 100644
index 000000000000..20fa7e9be697
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
@@ -0,0 +1,142 @@ 
+/** @file
+  Architecture Independent Base Memory Library Implementation.
+
+  The following BaseMemoryLib instances contain the same copy of this file:
+    BaseMemoryLib
+    PeiMemoryLib
+    UefiMemoryLib
+
+  Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.<BR>
+  This program and the accompanying materials
+  are licensed and made available under the terms and conditions of the BSD License
+  which accompanies this distribution.  The full text of the license may be found at
+  http://opensource.org/licenses/bsd-license.php.
+
+  THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+  WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+
+**/
+
+#include "../MemLibInternals.h"
+
+/**
+  Scans a target buffer for a 16-bit value, and returns a pointer to the
+  matching 16-bit value in the target buffer.
+
+  @param  Buffer  The pointer to the target buffer to scan.
+  @param  Length  The count of 16-bit values to scan. Must be non-zero.
+  @param  Value   The value to search for in the target buffer.
+
+  @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem16 (
+  IN      CONST VOID                *Buffer,
+  IN      UINTN                     Length,
+  IN      UINT16                    Value
+  )
+{
+  CONST UINT16                      *Pointer;
+
+  Pointer = (CONST UINT16*)Buffer;
+  do {
+    if (*Pointer == Value) {
+      return Pointer;
+    }
+    ++Pointer;
+  } while (--Length != 0);
+  return NULL;
+}
+
+/**
+  Scans a target buffer for a 32-bit value, and returns a pointer to the
+  matching 32-bit value in the target buffer.
+
+  @param  Buffer  The pointer to the target buffer to scan.
+  @param  Length  The count of 32-bit values to scan. Must be non-zero.
+  @param  Value   The value to search for in the target buffer.
+
+  @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem32 (
+  IN      CONST VOID                *Buffer,
+  IN      UINTN                     Length,
+  IN      UINT32                    Value
+  )
+{
+  CONST UINT32                      *Pointer;
+
+  Pointer = (CONST UINT32*)Buffer;
+  do {
+    if (*Pointer == Value) {
+      return Pointer;
+    }
+    ++Pointer;
+  } while (--Length != 0);
+  return NULL;
+}
+
+/**
+  Scans a target buffer for a 64-bit value, and returns a pointer to the
+  matching 64-bit value in the target buffer.
+
+  @param  Buffer  The pointer to the target buffer to scan.
+  @param  Length  The count of 64-bit values to scan. Must be non-zero.
+  @param  Value   The value to search for in the target buffer.
+
+  @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem64 (
+  IN      CONST VOID                *Buffer,
+  IN      UINTN                     Length,
+  IN      UINT64                    Value
+  )
+{
+  CONST UINT64                      *Pointer;
+
+  Pointer = (CONST UINT64*)Buffer;
+  do {
+    if (*Pointer == Value) {
+      return Pointer;
+    }
+    ++Pointer;
+  } while (--Length != 0);
+  return NULL;
+}
+
+/**
+  Checks whether the contents of a buffer are all zeros.
+
+  @param  Buffer  The pointer to the buffer to be checked.
+  @param  Length  The size of the buffer (in bytes) to be checked.
+
+  @retval TRUE    Contents of the buffer are all zeros.
+  @retval FALSE   Contents of the buffer are not all zeros.
+
+**/
+BOOLEAN
+EFIAPI
+InternalMemIsZeroBuffer (
+  IN CONST VOID  *Buffer,
+  IN UINTN       Length
+  )
+{
+  CONST UINT8 *BufferData;
+  UINTN       Index;
+
+  BufferData = Buffer;
+  for (Index = 0; Index < Length; Index++) {
+    if (BufferData[Index] != 0) {
+      return FALSE;
+    }
+  }
+  return TRUE;
+}
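
(Aside, not part of the patch.) Note that the scan helpers above use
do/while loops, so they dereference the buffer before checking the
count: Length must be non-zero, as the @param lines state. The public
wrappers are expected to enforce that; a hypothetical guarded caller
would look like:

  // FindFirstMatch16 is a hypothetical example, not part of this patch.
  CONST UINT16 *
  FindFirstMatch16 (
    IN CONST UINT16  *Buffer,
    IN UINTN         Count,
    IN UINT16        Value
    )
  {
    if (Count == 0) {
      return NULL;   // InternalMemScanMem16() would read *Buffer anyway
    }
    return (CONST UINT16 *)InternalMemScanMem16 (Buffer, Count, Value);
  }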
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
new file mode 100644
index 000000000000..aa7ae51bf05e
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
@@ -0,0 +1,69 @@ 
+#------------------------------------------------------------------------------
+#
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+#
+# This program and the accompanying materials are licensed and made available
+# under the terms and conditions of the BSD License which accompanies this
+# distribution.  The full text of the license may be found at 
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+    .text
+    .thumb
+    .syntax unified
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+    movs    r2, #0
+    b       32f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+    uxtb    r2, r2
+    orr     r2, r2, r2, lsl #8
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+    uxth    r2, r2
+    orr     r2, r2, r2, lsl #16
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+32: mov     r3, r2
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+    push    {r0, lr}
+    add     ip, r0, r1              // ip := dst + length
+0:  adds    r0, r0, #16             // advance the output pointer by 16 bytes
+    cmp     r0, ip                  // past the output?
+    bgt     1f                      // break out of the loop
+    strd    r2, r3, [r0, #-16]      // store all 16 bytes
+    strd    r2, r3, [r0, #-8]
+    bic     r0, r0, #15             // align output pointer
+    b       0b                      // goto beginning of loop
+
+1:  sub     r1, ip, r0              // sync length with aligned output pointer
+    tst     r1, #8                  // between 8 and 15 bytes?
+    ittt    ne
+    strdne  r2, r3, [r0, #-16]      // overlapping store of 8 + 8 bytes
+    strdne  r2, r3, [ip, #-8]
+    popne   {r0, pc}
+
+    tst     r1, #4                  // between 4 and 7 bytes?
+    ittt    ne
+    strne   r2, [r0, #-16]          // overlapping store of 4 + 4 bytes
+    strne   r2, [ip, #-4]
+    popne   {r0, pc}
+
+    tst     r1, #1                  // 1 or 3 bytes?
+    it      ne
+    strbne  r2, [r0, #-16]          // store 1 byte
+    tst     r1, #2                  // 2 or 3 bytes?
+    it      ne
+    strhne  r2, [ip, #-2]           // store 2 bytes
+    pop     {r0, pc}
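
(Aside, not part of the patch.) The tail above avoids a byte loop by
covering the remaining 0..15 bytes with overlapping power-of-two
stores. A hypothetical C model of the same idea:

  #include <stdint.h>
  #include <string.h>

  // p..end is the 0..15 byte remainder; pattern is the replicated fill.
  static void SetTailModel (uint8_t *p, uint8_t *end, uint64_t pattern)
  {
    size_t rem = (size_t)(end - p);

    if (rem & 8) {            // 8..15 bytes: two overlapping 8-byte stores
      memcpy (p, &pattern, 8);
      memcpy (end - 8, &pattern, 8);
      return;
    }
    if (rem & 4) {            // 4..7 bytes: two overlapping 4-byte stores
      memcpy (p, &pattern, 4);
      memcpy (end - 4, &pattern, 4);
      return;
    }
    if (rem & 1) {            // 1 or 3 bytes
      p[0] = (uint8_t)pattern;
    }
    if (rem & 2) {            // 2 or 3 bytes
      memcpy (end - 2, &pattern, 2);
    }
  }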
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
new file mode 100644
index 000000000000..969ded0c45e6
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
@@ -0,0 +1,74 @@ 
+;------------------------------------------------------------------------------
+;
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+;
+; This program and the accompanying materials are licensed and made available
+; under the terms and conditions of the BSD License which accompanies this
+; distribution.  The full text of the license may be found at 
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+    EXPORT  InternalMemZeroMem
+    EXPORT  InternalMemSetMem
+    EXPORT  InternalMemSetMem16
+    EXPORT  InternalMemSetMem32
+    EXPORT  InternalMemSetMem64
+
+    AREA    SetMem, CODE, READONLY
+    THUMB
+
+InternalMemZeroMem
+    movs    r2, #0
+    b       InternalMemSetMem32
+
+InternalMemSetMem
+    uxtb    r2, r2
+    orr     r2, r2, r2, lsl #8
+
+InternalMemSetMem16
+    uxth    r2, r2
+    orr     r2, r2, r2, lsl #16
+
+InternalMemSetMem32
+    mov     r3, r2
+
+InternalMemSetMem64
+    push    {r0, lr}
+    add     ip, r0, r1              ; ip := dst + length
+L0
+    adds    r0, r0, #16             ; advance the output pointer by 16 bytes
+    cmp     r0, ip                  ; past the end of the output?
+    bgt     L1                      ; if so, handle the remaining bytes
+    strd    r2, r3, [r0, #-16]      ; store all 16 bytes
+    strd    r2, r3, [r0, #-8]
+    bic     r0, r0, #15             ; align output pointer
+    b       L0
+
+    ; Set the remaining 0..15 bytes.
+L1
+    sub     r1, ip, r0              ; sync length with aligned output pointer
+    tst     r1, #8                  ; between 8 and 15 bytes?
+    ittt    ne
+    strdne  r2, r3, [r0, #-16]      ; overlapping store of 8 + 8 bytes
+    strdne  r2, r3, [ip, #-8]
+    popne   {r0, pc}
+
+    tst     r1, #4                  ; between 4 and 7 bytes?
+    ittt    ne
+    strne   r2, [r0, #-16]          ; overlapping store of 4 + 4 bytes
+    strne   r2, [ip, #-4]
+    popne   {r0, pc}
+
+    tst     r1, #1                  ; 1 or 3 bytes?
+    it      ne
+    strbne  r2, [r0, #-16]          ; store 1 byte
+    tst     r1, #2                  ; 2 or 3 bytes?
+    it      ne
+    strhne  r2, [ip, #-2]           ; store 2 bytes
+    pop     {r0, pc}
+
+    END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index 71691b9859e3..d95eb599ea9e 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@  [Defines]
 
 
 #
-#  VALID_ARCHITECTURES           = IA32 X64
+#  VALID_ARCHITECTURES           = IA32 X64 ARM
 #
 
 [Sources]
@@ -79,19 +79,6 @@  [Sources.Ia32]
   Ia32/CopyMem.nasm
   Ia32/CopyMem.asm
   Ia32/IsZeroBuffer.nasm
-  ScanMem64Wrapper.c
-  ScanMem32Wrapper.c
-  ScanMem16Wrapper.c
-  ScanMem8Wrapper.c
-  ZeroMemWrapper.c
-  CompareMemWrapper.c
-  SetMem64Wrapper.c
-  SetMem32Wrapper.c
-  SetMem16Wrapper.c
-  SetMemWrapper.c
-  CopyMemWrapper.c
-  IsZeroBufferWrapper.c
-  MemLibGuid.c
 
 [Sources.X64]
   X64/ScanMem64.nasm
@@ -128,6 +115,21 @@  [Sources.X64]
   X64/CopyMem.asm
   X64/CopyMem.S
   X64/IsZeroBuffer.nasm
+
+[Sources.ARM]
+  Arm/ScanMem.S       |GCC
+  Arm/SetMem.S        |GCC
+  Arm/CopyMem.S       |GCC
+  Arm/CompareMem.S    |GCC
+
+  Arm/ScanMem.asm     |RVCT
+  Arm/SetMem.asm      |RVCT
+  Arm/CopyMem.asm     |RVCT
+  Arm/CompareMem.asm  |RVCT
+
+  Arm/ScanMemGeneric.c
+
+[Sources]
   ScanMem64Wrapper.c
   ScanMem32Wrapper.c
   ScanMem16Wrapper.c