Message ID | 20180315140229.7737-2-agraf@suse.de |
---|---|
State | Accepted |
Commit | 8e475064097d182191129e31846c585421f0f85a |
Headers | show |
Series | [1/2] efi_loader: Optimize GOP switch | expand |
On 03/15/2018 03:02 PM, Alexander Graf wrote: > The GOP path was optimized, but still not as fast as it should be. Let's > push it even further by trimming the hot path into simple 32bit load/store > operations for buf->vid 32bpp operations. > > Signed-off-by: Alexander Graf <agraf@suse.de> > --- > lib/efi_loader/efi_gop.c | 176 ++++++++++++++++++++++++++++++----------------- > 1 file changed, 114 insertions(+), 62 deletions(-) > > diff --git a/lib/efi_loader/efi_gop.c b/lib/efi_loader/efi_gop.c > index bbdf34e1dd..7b76e49ab0 100644 > --- a/lib/efi_loader/efi_gop.c > +++ b/lib/efi_loader/efi_gop.c > @@ -78,18 +78,20 @@ static inline u16 efi_blt_col_to_vid16(struct efi_gop_pixel *blt) > } > > static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, > - struct efi_gop_pixel *buffer, > + struct efi_gop_pixel *bufferp, > u32 operation, efi_uintn_t sx, > efi_uintn_t sy, efi_uintn_t dx, > efi_uintn_t dy, > efi_uintn_t width, > efi_uintn_t height, > - efi_uintn_t delta) > + efi_uintn_t delta, > + efi_uintn_t vid_bpp) > { > struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops); > - efi_uintn_t i, j, linelen; > + efi_uintn_t i, j, linelen, slineoff = 0, dlineoff, swidth, dwidth; > u32 *fb32 = gopobj->fb; > u16 *fb16 = gopobj->fb; > + struct efi_gop_pixel *buffer = __builtin_assume_aligned(bufferp, 4); > > if (delta) { > /* Check for 4 byte alignment */ > @@ -133,6 +135,37 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, > break; > } > > + /* Calculate line width */ > + switch (operation) { > + case EFI_BLT_BUFFER_TO_VIDEO: > + swidth = linelen; > + break; > + case EFI_BLT_VIDEO_TO_BLT_BUFFER: > + case EFI_BLT_VIDEO_TO_VIDEO: > + swidth = gopobj->info.width; > + if (!vid_bpp) > + return EFI_UNSUPPORTED; > + break; > + case EFI_BLT_VIDEO_FILL: > + swidth = 0; > + break; > + } > + > + switch (operation) { > + case EFI_BLT_BUFFER_TO_VIDEO: > + case EFI_BLT_VIDEO_FILL: > + case EFI_BLT_VIDEO_TO_VIDEO: > + dwidth = gopobj->info.width; > + if (!vid_bpp) > + return EFI_UNSUPPORTED; > + break; > + case EFI_BLT_VIDEO_TO_BLT_BUFFER: > + dwidth = linelen; > + break; > + } > + > + slineoff = swidth * sy; > + dlineoff = dwidth * dy; > for (i = 0; i < height; i++) { > for (j = 0; j < width; j++) { > struct efi_gop_pixel pix; > @@ -143,70 +176,65 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, > pix = *buffer; > break; > case EFI_BLT_BUFFER_TO_VIDEO: > - pix = buffer[linelen * (i + sy) + j + sx]; > + pix = buffer[slineoff + j + sx]; > break; > case EFI_BLT_VIDEO_TO_BLT_BUFFER: > case EFI_BLT_VIDEO_TO_VIDEO: > - switch (gopobj->bpix) { > -#ifdef CONFIG_DM_VIDEO > - case VIDEO_BPP32: > -#else > - case LCD_COLOR32: > -#endif > + if (vid_bpp == 32) > pix = *(struct efi_gop_pixel *)&fb32[ > - gopobj->info.width * > - (i + sy) + j + sx]; > - break; > -#ifdef CONFIG_DM_VIDEO > - case VIDEO_BPP16: > -#else > - case LCD_COLOR16: > -#endif > + slineoff + j + sx]; > + else > pix = efi_vid16_to_blt_col(fb16[ Shouldn't we eliminate this conversion for EFI_BLT_VIDEO_TO_VIDEO? > - gopobj->info.width * > - (i + sy) + j + sx]); > - break; > - default: > - return EFI_UNSUPPORTED; > - } > + slineoff + j + sx]); > break; > } > > /* Write destination pixel */ > switch (operation) { > case EFI_BLT_VIDEO_TO_BLT_BUFFER: > - buffer[linelen * (i + dy) + j + dx] = pix; > + does buffer[dlineoff + j + dx] = pix; > break; > case EFI_BLT_BUFFER_TO_VIDEO: > case EFI_BLT_VIDEO_FILL: > case EFI_BLT_VIDEO_TO_VIDEO: > - switch (gopobj->bpix) { > + if (vid_bpp == 32) > + fb32[dlineoff + j + dx] = *(u32 *)&pix; > + else > + fb16[dlineoff + j + dx] = > + efi_blt_col_to_vid16(&pix); Same here. The following problem is not introduced by your patch series so it should not stop you from merging the patches: For EFI_BLT_VIDEO_TO_VIDEO the spec does not define how to copy overlapping rectangles. The iteration direction makes a big difference here. I think we should do overlapping copies always in a way that keeps the contents of the source rectangle. Currently we corrupt it when sy == dy && sx < dx < sx + width || sy < dy < dy + height. For dy > sy we should iterate bottom to top. For dx > sy && dy == sy we should iterate right to left. Best regards Heinrich > + break; > + } > + } > + slineoff += swidth; > + dlineoff += dwidth; > + } > + > + return EFI_SUCCESS; > +} > + > +static efi_uintn_t gop_get_bpp(struct efi_gop *this) > +{ > + struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops); > + efi_uintn_t vid_bpp = 0; > + > + switch (gopobj->bpix) { > #ifdef CONFIG_DM_VIDEO > - case VIDEO_BPP32: > + case VIDEO_BPP32: > #else > - case LCD_COLOR32: > + case LCD_COLOR32: > #endif > - fb32[gopobj->info.width * > - (i + dy) + j + dx] = *(u32 *)&pix; > - break; > + vid_bpp = 32; > + break; > #ifdef CONFIG_DM_VIDEO > - case VIDEO_BPP16: > + case VIDEO_BPP16: > #else > - case LCD_COLOR16: > + case LCD_COLOR16: > #endif > - fb16[gopobj->info.width * > - (i + dy) + j + dx] = > - efi_blt_col_to_vid16(&pix); > - break; > - default: > - return EFI_UNSUPPORTED; > - } > - break; > - } > - } > + vid_bpp = 16; > + break; > } > > - return EFI_SUCCESS; > + return vid_bpp; > } > > /* > @@ -223,21 +251,33 @@ static efi_status_t gop_blt_video_fill(struct efi_gop *this, > u32 foo, efi_uintn_t sx, > efi_uintn_t sy, efi_uintn_t dx, > efi_uintn_t dy, efi_uintn_t width, > - efi_uintn_t height, efi_uintn_t delta) > + efi_uintn_t height, efi_uintn_t delta, > + efi_uintn_t vid_bpp) > { > return gop_blt_int(this, buffer, EFI_BLT_VIDEO_FILL, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, vid_bpp); > } > > -static efi_status_t gop_blt_buf_to_vid(struct efi_gop *this, > - struct efi_gop_pixel *buffer, > - u32 foo, efi_uintn_t sx, > - efi_uintn_t sy, efi_uintn_t dx, > - efi_uintn_t dy, efi_uintn_t width, > - efi_uintn_t height, efi_uintn_t delta) > +static efi_status_t gop_blt_buf_to_vid16(struct efi_gop *this, > + struct efi_gop_pixel *buffer, > + u32 foo, efi_uintn_t sx, > + efi_uintn_t sy, efi_uintn_t dx, > + efi_uintn_t dy, efi_uintn_t width, > + efi_uintn_t height, efi_uintn_t delta) > { > return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, 16); > +} > + > +static efi_status_t gop_blt_buf_to_vid32(struct efi_gop *this, > + struct efi_gop_pixel *buffer, > + u32 foo, efi_uintn_t sx, > + efi_uintn_t sy, efi_uintn_t dx, > + efi_uintn_t dy, efi_uintn_t width, > + efi_uintn_t height, efi_uintn_t delta) > +{ > + return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx, > + dy, width, height, delta, 32); > } > > static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this, > @@ -245,10 +285,11 @@ static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this, > u32 foo, efi_uintn_t sx, > efi_uintn_t sy, efi_uintn_t dx, > efi_uintn_t dy, efi_uintn_t width, > - efi_uintn_t height, efi_uintn_t delta) > + efi_uintn_t height, efi_uintn_t delta, > + efi_uintn_t vid_bpp) > { > return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_VIDEO, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, vid_bpp); > } > > static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this, > @@ -256,10 +297,11 @@ static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this, > u32 foo, efi_uintn_t sx, > efi_uintn_t sy, efi_uintn_t dx, > efi_uintn_t dy, efi_uintn_t width, > - efi_uintn_t height, efi_uintn_t delta) > + efi_uintn_t height, efi_uintn_t delta, > + efi_uintn_t vid_bpp) > { > return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_BLT_BUFFER, sx, sy, > - dx, dy, width, height, delta); > + dx, dy, width, height, delta, vid_bpp); > } > > /* > @@ -287,27 +329,37 @@ efi_status_t EFIAPI gop_blt(struct efi_gop *this, struct efi_gop_pixel *buffer, > efi_uintn_t height, efi_uintn_t delta) > { > efi_status_t ret = EFI_INVALID_PARAMETER; > + efi_uintn_t vid_bpp; > > EFI_ENTRY("%p, %p, %u, %zu, %zu, %zu, %zu, %zu, %zu, %zu", this, > buffer, operation, sx, sy, dx, dy, width, height, delta); > > + vid_bpp = gop_get_bpp(this); > + > /* Allow for compiler optimization */ > switch (operation) { > case EFI_BLT_VIDEO_FILL: > ret = gop_blt_video_fill(this, buffer, operation, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, vid_bpp); > break; > case EFI_BLT_BUFFER_TO_VIDEO: > - ret = gop_blt_buf_to_vid(this, buffer, operation, sx, sy, dx, > - dy, width, height, delta); > + /* This needs to be super-fast, so duplicate for 16/32bpp */ > + if (vid_bpp == 32) > + ret = gop_blt_buf_to_vid32(this, buffer, operation, sx, > + sy, dx, dy, width, height, > + delta); > + else > + ret = gop_blt_buf_to_vid16(this, buffer, operation, sx, > + sy, dx, dy, width, height, > + delta); > break; > case EFI_BLT_VIDEO_TO_VIDEO: > ret = gop_blt_vid_to_vid(this, buffer, operation, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, vid_bpp); > break; > case EFI_BLT_VIDEO_TO_BLT_BUFFER: > ret = gop_blt_vid_to_buf(this, buffer, operation, sx, sy, dx, > - dy, width, height, delta); > + dy, width, height, delta, vid_bpp); > break; > default: > ret = EFI_UNSUPPORTED; >
On 03/16/2018 11:55 AM, Heinrich Schuchardt wrote: > > > On 03/15/2018 03:02 PM, Alexander Graf wrote: >> The GOP path was optimized, but still not as fast as it should be. Let's >> push it even further by trimming the hot path into simple 32bit >> load/store >> operations for buf->vid 32bpp operations. >> >> Signed-off-by: Alexander Graf <agraf@suse.de> >> --- <snip /> > > The following problem is not introduced by your patch series so it > should not stop you from merging the patches: > > For EFI_BLT_VIDEO_TO_VIDEO the spec does not define how to copy > overlapping rectangles. The iteration direction makes a big difference > here. > > I think we should do overlapping copies always in a way that keeps the > contents of the source rectangle. Currently we corrupt it when > > sy == dy && sx < dx < sx + width || sy < dy < dy + height. > > For dy > sy we should iterate bottom to top. For dx > sy && dy == sy we > should iterate right to left. > > Best regards > > Heinrich > EDK has said logic in CorebootPayloadPkg/FbGop/FbGop.c. Regards Heinrich
> The GOP path was optimized, but still not as fast as it should be. Let's > push it even further by trimming the hot path into simple 32bit load/store > operations for buf->vid 32bpp operations. > > Signed-off-by: Alexander Graf <agraf@suse.de> Thanks, applied to efi-next Alex
diff --git a/lib/efi_loader/efi_gop.c b/lib/efi_loader/efi_gop.c index bbdf34e1dd..7b76e49ab0 100644 --- a/lib/efi_loader/efi_gop.c +++ b/lib/efi_loader/efi_gop.c @@ -78,18 +78,20 @@ static inline u16 efi_blt_col_to_vid16(struct efi_gop_pixel *blt) } static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, - struct efi_gop_pixel *buffer, + struct efi_gop_pixel *bufferp, u32 operation, efi_uintn_t sx, efi_uintn_t sy, efi_uintn_t dx, efi_uintn_t dy, efi_uintn_t width, efi_uintn_t height, - efi_uintn_t delta) + efi_uintn_t delta, + efi_uintn_t vid_bpp) { struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops); - efi_uintn_t i, j, linelen; + efi_uintn_t i, j, linelen, slineoff = 0, dlineoff, swidth, dwidth; u32 *fb32 = gopobj->fb; u16 *fb16 = gopobj->fb; + struct efi_gop_pixel *buffer = __builtin_assume_aligned(bufferp, 4); if (delta) { /* Check for 4 byte alignment */ @@ -133,6 +135,37 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, break; } + /* Calculate line width */ + switch (operation) { + case EFI_BLT_BUFFER_TO_VIDEO: + swidth = linelen; + break; + case EFI_BLT_VIDEO_TO_BLT_BUFFER: + case EFI_BLT_VIDEO_TO_VIDEO: + swidth = gopobj->info.width; + if (!vid_bpp) + return EFI_UNSUPPORTED; + break; + case EFI_BLT_VIDEO_FILL: + swidth = 0; + break; + } + + switch (operation) { + case EFI_BLT_BUFFER_TO_VIDEO: + case EFI_BLT_VIDEO_FILL: + case EFI_BLT_VIDEO_TO_VIDEO: + dwidth = gopobj->info.width; + if (!vid_bpp) + return EFI_UNSUPPORTED; + break; + case EFI_BLT_VIDEO_TO_BLT_BUFFER: + dwidth = linelen; + break; + } + + slineoff = swidth * sy; + dlineoff = dwidth * dy; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { struct efi_gop_pixel pix; @@ -143,70 +176,65 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this, pix = *buffer; break; case EFI_BLT_BUFFER_TO_VIDEO: - pix = buffer[linelen * (i + sy) + j + sx]; + pix = buffer[slineoff + j + sx]; break; case EFI_BLT_VIDEO_TO_BLT_BUFFER: case EFI_BLT_VIDEO_TO_VIDEO: - switch (gopobj->bpix) { -#ifdef CONFIG_DM_VIDEO - case VIDEO_BPP32: -#else - case LCD_COLOR32: -#endif + if (vid_bpp == 32) pix = *(struct efi_gop_pixel *)&fb32[ - gopobj->info.width * - (i + sy) + j + sx]; - break; -#ifdef CONFIG_DM_VIDEO - case VIDEO_BPP16: -#else - case LCD_COLOR16: -#endif + slineoff + j + sx]; + else pix = efi_vid16_to_blt_col(fb16[ - gopobj->info.width * - (i + sy) + j + sx]); - break; - default: - return EFI_UNSUPPORTED; - } + slineoff + j + sx]); break; } /* Write destination pixel */ switch (operation) { case EFI_BLT_VIDEO_TO_BLT_BUFFER: - buffer[linelen * (i + dy) + j + dx] = pix; + buffer[dlineoff + j + dx] = pix; break; case EFI_BLT_BUFFER_TO_VIDEO: case EFI_BLT_VIDEO_FILL: case EFI_BLT_VIDEO_TO_VIDEO: - switch (gopobj->bpix) { + if (vid_bpp == 32) + fb32[dlineoff + j + dx] = *(u32 *)&pix; + else + fb16[dlineoff + j + dx] = + efi_blt_col_to_vid16(&pix); + break; + } + } + slineoff += swidth; + dlineoff += dwidth; + } + + return EFI_SUCCESS; +} + +static efi_uintn_t gop_get_bpp(struct efi_gop *this) +{ + struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops); + efi_uintn_t vid_bpp = 0; + + switch (gopobj->bpix) { #ifdef CONFIG_DM_VIDEO - case VIDEO_BPP32: + case VIDEO_BPP32: #else - case LCD_COLOR32: + case LCD_COLOR32: #endif - fb32[gopobj->info.width * - (i + dy) + j + dx] = *(u32 *)&pix; - break; + vid_bpp = 32; + break; #ifdef CONFIG_DM_VIDEO - case VIDEO_BPP16: + case VIDEO_BPP16: #else - case LCD_COLOR16: + case LCD_COLOR16: #endif - fb16[gopobj->info.width * - (i + dy) + j + dx] = - efi_blt_col_to_vid16(&pix); - break; - default: - return EFI_UNSUPPORTED; - } - break; - } - } + vid_bpp = 16; + break; } - return EFI_SUCCESS; + return vid_bpp; } /* @@ -223,21 +251,33 @@ static efi_status_t gop_blt_video_fill(struct efi_gop *this, u32 foo, efi_uintn_t sx, efi_uintn_t sy, efi_uintn_t dx, efi_uintn_t dy, efi_uintn_t width, - efi_uintn_t height, efi_uintn_t delta) + efi_uintn_t height, efi_uintn_t delta, + efi_uintn_t vid_bpp) { return gop_blt_int(this, buffer, EFI_BLT_VIDEO_FILL, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, vid_bpp); } -static efi_status_t gop_blt_buf_to_vid(struct efi_gop *this, - struct efi_gop_pixel *buffer, - u32 foo, efi_uintn_t sx, - efi_uintn_t sy, efi_uintn_t dx, - efi_uintn_t dy, efi_uintn_t width, - efi_uintn_t height, efi_uintn_t delta) +static efi_status_t gop_blt_buf_to_vid16(struct efi_gop *this, + struct efi_gop_pixel *buffer, + u32 foo, efi_uintn_t sx, + efi_uintn_t sy, efi_uintn_t dx, + efi_uintn_t dy, efi_uintn_t width, + efi_uintn_t height, efi_uintn_t delta) { return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, 16); +} + +static efi_status_t gop_blt_buf_to_vid32(struct efi_gop *this, + struct efi_gop_pixel *buffer, + u32 foo, efi_uintn_t sx, + efi_uintn_t sy, efi_uintn_t dx, + efi_uintn_t dy, efi_uintn_t width, + efi_uintn_t height, efi_uintn_t delta) +{ + return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx, + dy, width, height, delta, 32); } static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this, @@ -245,10 +285,11 @@ static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this, u32 foo, efi_uintn_t sx, efi_uintn_t sy, efi_uintn_t dx, efi_uintn_t dy, efi_uintn_t width, - efi_uintn_t height, efi_uintn_t delta) + efi_uintn_t height, efi_uintn_t delta, + efi_uintn_t vid_bpp) { return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_VIDEO, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, vid_bpp); } static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this, @@ -256,10 +297,11 @@ static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this, u32 foo, efi_uintn_t sx, efi_uintn_t sy, efi_uintn_t dx, efi_uintn_t dy, efi_uintn_t width, - efi_uintn_t height, efi_uintn_t delta) + efi_uintn_t height, efi_uintn_t delta, + efi_uintn_t vid_bpp) { return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_BLT_BUFFER, sx, sy, - dx, dy, width, height, delta); + dx, dy, width, height, delta, vid_bpp); } /* @@ -287,27 +329,37 @@ efi_status_t EFIAPI gop_blt(struct efi_gop *this, struct efi_gop_pixel *buffer, efi_uintn_t height, efi_uintn_t delta) { efi_status_t ret = EFI_INVALID_PARAMETER; + efi_uintn_t vid_bpp; EFI_ENTRY("%p, %p, %u, %zu, %zu, %zu, %zu, %zu, %zu, %zu", this, buffer, operation, sx, sy, dx, dy, width, height, delta); + vid_bpp = gop_get_bpp(this); + /* Allow for compiler optimization */ switch (operation) { case EFI_BLT_VIDEO_FILL: ret = gop_blt_video_fill(this, buffer, operation, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, vid_bpp); break; case EFI_BLT_BUFFER_TO_VIDEO: - ret = gop_blt_buf_to_vid(this, buffer, operation, sx, sy, dx, - dy, width, height, delta); + /* This needs to be super-fast, so duplicate for 16/32bpp */ + if (vid_bpp == 32) + ret = gop_blt_buf_to_vid32(this, buffer, operation, sx, + sy, dx, dy, width, height, + delta); + else + ret = gop_blt_buf_to_vid16(this, buffer, operation, sx, + sy, dx, dy, width, height, + delta); break; case EFI_BLT_VIDEO_TO_VIDEO: ret = gop_blt_vid_to_vid(this, buffer, operation, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, vid_bpp); break; case EFI_BLT_VIDEO_TO_BLT_BUFFER: ret = gop_blt_vid_to_buf(this, buffer, operation, sx, sy, dx, - dy, width, height, delta); + dy, width, height, delta, vid_bpp); break; default: ret = EFI_UNSUPPORTED;
The GOP path was optimized, but still not as fast as it should be. Let's push it even further by trimming the hot path into simple 32bit load/store operations for buf->vid 32bpp operations. Signed-off-by: Alexander Graf <agraf@suse.de> --- lib/efi_loader/efi_gop.c | 176 ++++++++++++++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 62 deletions(-)