diff mbox series

[2/2] efi_loader: Optimize GOP more

Message ID 20180315140229.7737-2-agraf@suse.de
State Accepted
Commit 8e475064097d182191129e31846c585421f0f85a
Headers show
Series [1/2] efi_loader: Optimize GOP switch | expand

Commit Message

Alexander Graf March 15, 2018, 2:02 p.m. UTC
The GOP path was optimized, but still not as fast as it should be. Let's
push it even further by trimming the hot path into simple 32bit load/store
operations for buf->vid 32bpp operations.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 lib/efi_loader/efi_gop.c | 176 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 114 insertions(+), 62 deletions(-)

Comments

Heinrich Schuchardt March 16, 2018, 10:55 a.m. UTC | #1
On 03/15/2018 03:02 PM, Alexander Graf wrote:
> The GOP path was optimized, but still not as fast as it should be. Let's
> push it even further by trimming the hot path into simple 32bit load/store
> operations for buf->vid 32bpp operations.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>
> ---
>   lib/efi_loader/efi_gop.c | 176 ++++++++++++++++++++++++++++++-----------------
>   1 file changed, 114 insertions(+), 62 deletions(-)
> 
> diff --git a/lib/efi_loader/efi_gop.c b/lib/efi_loader/efi_gop.c
> index bbdf34e1dd..7b76e49ab0 100644
> --- a/lib/efi_loader/efi_gop.c
> +++ b/lib/efi_loader/efi_gop.c
> @@ -78,18 +78,20 @@ static inline u16 efi_blt_col_to_vid16(struct efi_gop_pixel *blt)
>   }
>   
>   static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
> -						struct efi_gop_pixel *buffer,
> +						struct efi_gop_pixel *bufferp,
>   						u32 operation, efi_uintn_t sx,
>   						efi_uintn_t sy, efi_uintn_t dx,
>   						efi_uintn_t dy,
>   						efi_uintn_t width,
>   						efi_uintn_t height,
> -						efi_uintn_t delta)
> +						efi_uintn_t delta,
> +						efi_uintn_t vid_bpp)
>   {
>   	struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
> -	efi_uintn_t i, j, linelen;
> +	efi_uintn_t i, j, linelen, slineoff = 0, dlineoff, swidth, dwidth;
>   	u32 *fb32 = gopobj->fb;
>   	u16 *fb16 = gopobj->fb;
> +	struct efi_gop_pixel *buffer = __builtin_assume_aligned(bufferp, 4);
>   
>   	if (delta) {
>   		/* Check for 4 byte alignment */
> @@ -133,6 +135,37 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
>   		break;
>   	}
>   
> +	/* Calculate line width */
> +	switch (operation) {
> +	case EFI_BLT_BUFFER_TO_VIDEO:
> +		swidth = linelen;
> +		break;
> +	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> +	case EFI_BLT_VIDEO_TO_VIDEO:
> +		swidth = gopobj->info.width;
> +		if (!vid_bpp)
> +			return EFI_UNSUPPORTED;
> +		break;
> +	case EFI_BLT_VIDEO_FILL:
> +		swidth = 0;
> +		break;
> +	}
> +
> +	switch (operation) {
> +	case EFI_BLT_BUFFER_TO_VIDEO:
> +	case EFI_BLT_VIDEO_FILL:
> +	case EFI_BLT_VIDEO_TO_VIDEO:
> +		dwidth = gopobj->info.width;
> +		if (!vid_bpp)
> +			return EFI_UNSUPPORTED;
> +		break;
> +	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> +		dwidth = linelen;
> +		break;
> +	}
> +
> +	slineoff = swidth * sy;
> +	dlineoff = dwidth * dy;
>   	for (i = 0; i < height; i++) {
>   		for (j = 0; j < width; j++) {
>   			struct efi_gop_pixel pix;
> @@ -143,70 +176,65 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
>   				pix = *buffer;
>   				break;
>   			case EFI_BLT_BUFFER_TO_VIDEO:
> -				pix = buffer[linelen * (i + sy) + j + sx];
> +				pix = buffer[slineoff + j + sx];
>   				break;
>   			case EFI_BLT_VIDEO_TO_BLT_BUFFER:
>   			case EFI_BLT_VIDEO_TO_VIDEO:
> -				switch (gopobj->bpix) {
> -#ifdef CONFIG_DM_VIDEO
> -				case VIDEO_BPP32:
> -#else
> -				case LCD_COLOR32:
> -#endif
> +				if (vid_bpp == 32)
>   					pix = *(struct efi_gop_pixel *)&fb32[
> -						gopobj->info.width *
> -						(i + sy) + j + sx];
> -				break;
> -#ifdef CONFIG_DM_VIDEO
> -				case VIDEO_BPP16:
> -#else
> -				case LCD_COLOR16:
> -#endif
> +						slineoff + j + sx];
> +				else
>   					pix = efi_vid16_to_blt_col(fb16[

Shouldn't we eliminate this conversion for EFI_BLT_VIDEO_TO_VIDEO?

> -						gopobj->info.width *
> -						(i + sy) + j + sx]);
> -					break;
> -				default:
> -					return EFI_UNSUPPORTED;
> -				}
> +						slineoff + j + sx]);
>   				break;
>   			}
>   
>   			/* Write destination pixel */
>   			switch (operation) {
>   			case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> -				buffer[linelen * (i + dy) + j + dx] = pix;
> +			does	buffer[dlineoff + j + dx] = pix;
>   				break;
>   			case EFI_BLT_BUFFER_TO_VIDEO:
>   			case EFI_BLT_VIDEO_FILL:
>   			case EFI_BLT_VIDEO_TO_VIDEO:
> -				switch (gopobj->bpix) {
> +				if (vid_bpp == 32)
> +					fb32[dlineoff + j + dx] = *(u32 *)&pix;
> +				else
> +					fb16[dlineoff + j + dx] =
> +						efi_blt_col_to_vid16(&pix);

Same here.


The following problem is not introduced by your patch series so it 
should not stop you from merging the patches:

For EFI_BLT_VIDEO_TO_VIDEO the spec does not define how to copy 
overlapping rectangles. The iteration direction makes a big difference here.

I think we should do overlapping copies always in a way that keeps the 
contents of the source rectangle. Currently we corrupt it when

sy == dy && sx < dx < sx + width || sy < dy < dy + height.

For dy > sy we should iterate bottom to top. For dx > sy && dy == sy we 
should iterate right to left.

Best regards

Heinrich

> +				break;
> +			}
> +		}
> +		slineoff += swidth;
> +		dlineoff += dwidth;
> +	}
> +
> +	return EFI_SUCCESS;
> +}
> +
> +static efi_uintn_t gop_get_bpp(struct efi_gop *this)
> +{
> +	struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
> +	efi_uintn_t vid_bpp = 0;
> +
> +	switch (gopobj->bpix) {
>   #ifdef CONFIG_DM_VIDEO
> -				case VIDEO_BPP32:
> +	case VIDEO_BPP32:
>   #else
> -				case LCD_COLOR32:
> +	case LCD_COLOR32:
>   #endif
> -					fb32[gopobj->info.width *
> -					     (i + dy) + j + dx] = *(u32 *)&pix;
> -					break;
> +		vid_bpp = 32;
> +		break;
>   #ifdef CONFIG_DM_VIDEO
> -				case VIDEO_BPP16:
> +	case VIDEO_BPP16:
>   #else
> -				case LCD_COLOR16:
> +	case LCD_COLOR16:
>   #endif
> -					fb16[gopobj->info.width *
> -					     (i + dy) + j + dx] =
> -						efi_blt_col_to_vid16(&pix);
> -					break;
> -				default:
> -					return EFI_UNSUPPORTED;
> -				}
> -				break;
> -			}
> -		}
> +		vid_bpp = 16;
> +		break;
>   	}
>   
> -	return EFI_SUCCESS;
> +	return vid_bpp;
>   }
>   
>   /*
> @@ -223,21 +251,33 @@ static efi_status_t gop_blt_video_fill(struct efi_gop *this,
>   				       u32 foo, efi_uintn_t sx,
>   				       efi_uintn_t sy, efi_uintn_t dx,
>   				       efi_uintn_t dy, efi_uintn_t width,
> -				       efi_uintn_t height, efi_uintn_t delta)
> +				       efi_uintn_t height, efi_uintn_t delta,
> +				       efi_uintn_t vid_bpp)
>   {
>   	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_FILL, sx, sy, dx,
> -			   dy, width, height, delta);
> +			   dy, width, height, delta, vid_bpp);
>   }
>   
> -static efi_status_t gop_blt_buf_to_vid(struct efi_gop *this,
> -				       struct efi_gop_pixel *buffer,
> -				       u32 foo, efi_uintn_t sx,
> -				       efi_uintn_t sy, efi_uintn_t dx,
> -				       efi_uintn_t dy, efi_uintn_t width,
> -				       efi_uintn_t height, efi_uintn_t delta)
> +static efi_status_t gop_blt_buf_to_vid16(struct efi_gop *this,
> +					 struct efi_gop_pixel *buffer,
> +					 u32 foo, efi_uintn_t sx,
> +					 efi_uintn_t sy, efi_uintn_t dx,
> +					 efi_uintn_t dy, efi_uintn_t width,
> +					 efi_uintn_t height, efi_uintn_t delta)
>   {
>   	return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
> -			   dy, width, height, delta);
> +			   dy, width, height, delta, 16);
> +}
> +
> +static efi_status_t gop_blt_buf_to_vid32(struct efi_gop *this,
> +					 struct efi_gop_pixel *buffer,
> +					 u32 foo, efi_uintn_t sx,
> +					 efi_uintn_t sy, efi_uintn_t dx,
> +					 efi_uintn_t dy, efi_uintn_t width,
> +					 efi_uintn_t height, efi_uintn_t delta)
> +{
> +	return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
> +			   dy, width, height, delta, 32);
>   }
>   
>   static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
> @@ -245,10 +285,11 @@ static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
>   				       u32 foo, efi_uintn_t sx,
>   				       efi_uintn_t sy, efi_uintn_t dx,
>   				       efi_uintn_t dy, efi_uintn_t width,
> -				       efi_uintn_t height, efi_uintn_t delta)
> +				       efi_uintn_t height, efi_uintn_t delta,
> +				       efi_uintn_t vid_bpp)
>   {
>   	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_VIDEO, sx, sy, dx,
> -			   dy, width, height, delta);
> +			   dy, width, height, delta, vid_bpp);
>   }
>   
>   static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
> @@ -256,10 +297,11 @@ static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
>   				       u32 foo, efi_uintn_t sx,
>   				       efi_uintn_t sy, efi_uintn_t dx,
>   				       efi_uintn_t dy, efi_uintn_t width,
> -				       efi_uintn_t height, efi_uintn_t delta)
> +				       efi_uintn_t height, efi_uintn_t delta,
> +				       efi_uintn_t vid_bpp)
>   {
>   	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_BLT_BUFFER, sx, sy,
> -			   dx, dy, width, height, delta);
> +			   dx, dy, width, height, delta, vid_bpp);
>   }
>   
>   /*
> @@ -287,27 +329,37 @@ efi_status_t EFIAPI gop_blt(struct efi_gop *this, struct efi_gop_pixel *buffer,
>   			    efi_uintn_t height, efi_uintn_t delta)
>   {
>   	efi_status_t ret = EFI_INVALID_PARAMETER;
> +	efi_uintn_t vid_bpp;
>   
>   	EFI_ENTRY("%p, %p, %u, %zu, %zu, %zu, %zu, %zu, %zu, %zu", this,
>   		  buffer, operation, sx, sy, dx, dy, width, height, delta);
>   
> +	vid_bpp = gop_get_bpp(this);
> +
>   	/* Allow for compiler optimization */
>   	switch (operation) {
>   	case EFI_BLT_VIDEO_FILL:
>   		ret = gop_blt_video_fill(this, buffer, operation, sx, sy, dx,
> -					 dy, width, height, delta);
> +					 dy, width, height, delta, vid_bpp);
>   		break;
>   	case EFI_BLT_BUFFER_TO_VIDEO:
> -		ret = gop_blt_buf_to_vid(this, buffer, operation, sx, sy, dx,
> -					 dy, width, height, delta);
> +		/* This needs to be super-fast, so duplicate for 16/32bpp */
> +		if (vid_bpp == 32)
> +			ret = gop_blt_buf_to_vid32(this, buffer, operation, sx,
> +						   sy, dx, dy, width, height,
> +						   delta);
> +		else
> +			ret = gop_blt_buf_to_vid16(this, buffer, operation, sx,
> +						   sy, dx, dy, width, height,
> +						   delta);
>   		break;
>   	case EFI_BLT_VIDEO_TO_VIDEO:
>   		ret = gop_blt_vid_to_vid(this, buffer, operation, sx, sy, dx,
> -					 dy, width, height, delta);
> +					 dy, width, height, delta, vid_bpp);
>   		break;
>   	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
>   		ret = gop_blt_vid_to_buf(this, buffer, operation, sx, sy, dx,
> -					 dy, width, height, delta);
> +					 dy, width, height, delta, vid_bpp);
>   		break;
>   	default:
>   		ret = EFI_UNSUPPORTED;
>
Heinrich Schuchardt March 16, 2018, 2:08 p.m. UTC | #2
On 03/16/2018 11:55 AM, Heinrich Schuchardt wrote:
> 
> 
> On 03/15/2018 03:02 PM, Alexander Graf wrote:
>> The GOP path was optimized, but still not as fast as it should be. Let's
>> push it even further by trimming the hot path into simple 32bit 
>> load/store
>> operations for buf->vid 32bpp operations.
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>> ---
<snip />
> 
> The following problem is not introduced by your patch series so it 
> should not stop you from merging the patches:
> 
> For EFI_BLT_VIDEO_TO_VIDEO the spec does not define how to copy 
> overlapping rectangles. The iteration direction makes a big difference 
> here.
> 
> I think we should do overlapping copies always in a way that keeps the 
> contents of the source rectangle. Currently we corrupt it when
> 
> sy == dy && sx < dx < sx + width || sy < dy < dy + height.
> 
> For dy > sy we should iterate bottom to top. For dx > sy && dy == sy we 
> should iterate right to left.
> 
> Best regards
> 
> Heinrich
> 

EDK has said logic in CorebootPayloadPkg/FbGop/FbGop.c.

Regards

Heinrich
Alexander Graf April 4, 2018, 9:51 a.m. UTC | #3
> The GOP path was optimized, but still not as fast as it should be. Let's
> push it even further by trimming the hot path into simple 32bit load/store
> operations for buf->vid 32bpp operations.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>

Thanks, applied to efi-next

Alex
diff mbox series

Patch

diff --git a/lib/efi_loader/efi_gop.c b/lib/efi_loader/efi_gop.c
index bbdf34e1dd..7b76e49ab0 100644
--- a/lib/efi_loader/efi_gop.c
+++ b/lib/efi_loader/efi_gop.c
@@ -78,18 +78,20 @@  static inline u16 efi_blt_col_to_vid16(struct efi_gop_pixel *blt)
 }
 
 static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
-						struct efi_gop_pixel *buffer,
+						struct efi_gop_pixel *bufferp,
 						u32 operation, efi_uintn_t sx,
 						efi_uintn_t sy, efi_uintn_t dx,
 						efi_uintn_t dy,
 						efi_uintn_t width,
 						efi_uintn_t height,
-						efi_uintn_t delta)
+						efi_uintn_t delta,
+						efi_uintn_t vid_bpp)
 {
 	struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
-	efi_uintn_t i, j, linelen;
+	efi_uintn_t i, j, linelen, slineoff = 0, dlineoff, swidth, dwidth;
 	u32 *fb32 = gopobj->fb;
 	u16 *fb16 = gopobj->fb;
+	struct efi_gop_pixel *buffer = __builtin_assume_aligned(bufferp, 4);
 
 	if (delta) {
 		/* Check for 4 byte alignment */
@@ -133,6 +135,37 @@  static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
 		break;
 	}
 
+	/* Calculate line width */
+	switch (operation) {
+	case EFI_BLT_BUFFER_TO_VIDEO:
+		swidth = linelen;
+		break;
+	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
+	case EFI_BLT_VIDEO_TO_VIDEO:
+		swidth = gopobj->info.width;
+		if (!vid_bpp)
+			return EFI_UNSUPPORTED;
+		break;
+	case EFI_BLT_VIDEO_FILL:
+		swidth = 0;
+		break;
+	}
+
+	switch (operation) {
+	case EFI_BLT_BUFFER_TO_VIDEO:
+	case EFI_BLT_VIDEO_FILL:
+	case EFI_BLT_VIDEO_TO_VIDEO:
+		dwidth = gopobj->info.width;
+		if (!vid_bpp)
+			return EFI_UNSUPPORTED;
+		break;
+	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
+		dwidth = linelen;
+		break;
+	}
+
+	slineoff = swidth * sy;
+	dlineoff = dwidth * dy;
 	for (i = 0; i < height; i++) {
 		for (j = 0; j < width; j++) {
 			struct efi_gop_pixel pix;
@@ -143,70 +176,65 @@  static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
 				pix = *buffer;
 				break;
 			case EFI_BLT_BUFFER_TO_VIDEO:
-				pix = buffer[linelen * (i + sy) + j + sx];
+				pix = buffer[slineoff + j + sx];
 				break;
 			case EFI_BLT_VIDEO_TO_BLT_BUFFER:
 			case EFI_BLT_VIDEO_TO_VIDEO:
-				switch (gopobj->bpix) {
-#ifdef CONFIG_DM_VIDEO
-				case VIDEO_BPP32:
-#else
-				case LCD_COLOR32:
-#endif
+				if (vid_bpp == 32)
 					pix = *(struct efi_gop_pixel *)&fb32[
-						gopobj->info.width *
-						(i + sy) + j + sx];
-				break;
-#ifdef CONFIG_DM_VIDEO
-				case VIDEO_BPP16:
-#else
-				case LCD_COLOR16:
-#endif
+						slineoff + j + sx];
+				else
 					pix = efi_vid16_to_blt_col(fb16[
-						gopobj->info.width *
-						(i + sy) + j + sx]);
-					break;
-				default:
-					return EFI_UNSUPPORTED;
-				}
+						slineoff + j + sx]);
 				break;
 			}
 
 			/* Write destination pixel */
 			switch (operation) {
 			case EFI_BLT_VIDEO_TO_BLT_BUFFER:
-				buffer[linelen * (i + dy) + j + dx] = pix;
+				buffer[dlineoff + j + dx] = pix;
 				break;
 			case EFI_BLT_BUFFER_TO_VIDEO:
 			case EFI_BLT_VIDEO_FILL:
 			case EFI_BLT_VIDEO_TO_VIDEO:
-				switch (gopobj->bpix) {
+				if (vid_bpp == 32)
+					fb32[dlineoff + j + dx] = *(u32 *)&pix;
+				else
+					fb16[dlineoff + j + dx] =
+						efi_blt_col_to_vid16(&pix);
+				break;
+			}
+		}
+		slineoff += swidth;
+		dlineoff += dwidth;
+	}
+
+	return EFI_SUCCESS;
+}
+
+static efi_uintn_t gop_get_bpp(struct efi_gop *this)
+{
+	struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
+	efi_uintn_t vid_bpp = 0;
+
+	switch (gopobj->bpix) {
 #ifdef CONFIG_DM_VIDEO
-				case VIDEO_BPP32:
+	case VIDEO_BPP32:
 #else
-				case LCD_COLOR32:
+	case LCD_COLOR32:
 #endif
-					fb32[gopobj->info.width *
-					     (i + dy) + j + dx] = *(u32 *)&pix;
-					break;
+		vid_bpp = 32;
+		break;
 #ifdef CONFIG_DM_VIDEO
-				case VIDEO_BPP16:
+	case VIDEO_BPP16:
 #else
-				case LCD_COLOR16:
+	case LCD_COLOR16:
 #endif
-					fb16[gopobj->info.width *
-					     (i + dy) + j + dx] =
-						efi_blt_col_to_vid16(&pix);
-					break;
-				default:
-					return EFI_UNSUPPORTED;
-				}
-				break;
-			}
-		}
+		vid_bpp = 16;
+		break;
 	}
 
-	return EFI_SUCCESS;
+	return vid_bpp;
 }
 
 /*
@@ -223,21 +251,33 @@  static efi_status_t gop_blt_video_fill(struct efi_gop *this,
 				       u32 foo, efi_uintn_t sx,
 				       efi_uintn_t sy, efi_uintn_t dx,
 				       efi_uintn_t dy, efi_uintn_t width,
-				       efi_uintn_t height, efi_uintn_t delta)
+				       efi_uintn_t height, efi_uintn_t delta,
+				       efi_uintn_t vid_bpp)
 {
 	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_FILL, sx, sy, dx,
-			   dy, width, height, delta);
+			   dy, width, height, delta, vid_bpp);
 }
 
-static efi_status_t gop_blt_buf_to_vid(struct efi_gop *this,
-				       struct efi_gop_pixel *buffer,
-				       u32 foo, efi_uintn_t sx,
-				       efi_uintn_t sy, efi_uintn_t dx,
-				       efi_uintn_t dy, efi_uintn_t width,
-				       efi_uintn_t height, efi_uintn_t delta)
+static efi_status_t gop_blt_buf_to_vid16(struct efi_gop *this,
+					 struct efi_gop_pixel *buffer,
+					 u32 foo, efi_uintn_t sx,
+					 efi_uintn_t sy, efi_uintn_t dx,
+					 efi_uintn_t dy, efi_uintn_t width,
+					 efi_uintn_t height, efi_uintn_t delta)
 {
 	return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
-			   dy, width, height, delta);
+			   dy, width, height, delta, 16);
+}
+
+static efi_status_t gop_blt_buf_to_vid32(struct efi_gop *this,
+					 struct efi_gop_pixel *buffer,
+					 u32 foo, efi_uintn_t sx,
+					 efi_uintn_t sy, efi_uintn_t dx,
+					 efi_uintn_t dy, efi_uintn_t width,
+					 efi_uintn_t height, efi_uintn_t delta)
+{
+	return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
+			   dy, width, height, delta, 32);
 }
 
 static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
@@ -245,10 +285,11 @@  static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
 				       u32 foo, efi_uintn_t sx,
 				       efi_uintn_t sy, efi_uintn_t dx,
 				       efi_uintn_t dy, efi_uintn_t width,
-				       efi_uintn_t height, efi_uintn_t delta)
+				       efi_uintn_t height, efi_uintn_t delta,
+				       efi_uintn_t vid_bpp)
 {
 	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_VIDEO, sx, sy, dx,
-			   dy, width, height, delta);
+			   dy, width, height, delta, vid_bpp);
 }
 
 static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
@@ -256,10 +297,11 @@  static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
 				       u32 foo, efi_uintn_t sx,
 				       efi_uintn_t sy, efi_uintn_t dx,
 				       efi_uintn_t dy, efi_uintn_t width,
-				       efi_uintn_t height, efi_uintn_t delta)
+				       efi_uintn_t height, efi_uintn_t delta,
+				       efi_uintn_t vid_bpp)
 {
 	return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_BLT_BUFFER, sx, sy,
-			   dx, dy, width, height, delta);
+			   dx, dy, width, height, delta, vid_bpp);
 }
 
 /*
@@ -287,27 +329,37 @@  efi_status_t EFIAPI gop_blt(struct efi_gop *this, struct efi_gop_pixel *buffer,
 			    efi_uintn_t height, efi_uintn_t delta)
 {
 	efi_status_t ret = EFI_INVALID_PARAMETER;
+	efi_uintn_t vid_bpp;
 
 	EFI_ENTRY("%p, %p, %u, %zu, %zu, %zu, %zu, %zu, %zu, %zu", this,
 		  buffer, operation, sx, sy, dx, dy, width, height, delta);
 
+	vid_bpp = gop_get_bpp(this);
+
 	/* Allow for compiler optimization */
 	switch (operation) {
 	case EFI_BLT_VIDEO_FILL:
 		ret = gop_blt_video_fill(this, buffer, operation, sx, sy, dx,
-					 dy, width, height, delta);
+					 dy, width, height, delta, vid_bpp);
 		break;
 	case EFI_BLT_BUFFER_TO_VIDEO:
-		ret = gop_blt_buf_to_vid(this, buffer, operation, sx, sy, dx,
-					 dy, width, height, delta);
+		/* This needs to be super-fast, so duplicate for 16/32bpp */
+		if (vid_bpp == 32)
+			ret = gop_blt_buf_to_vid32(this, buffer, operation, sx,
+						   sy, dx, dy, width, height,
+						   delta);
+		else
+			ret = gop_blt_buf_to_vid16(this, buffer, operation, sx,
+						   sy, dx, dy, width, height,
+						   delta);
 		break;
 	case EFI_BLT_VIDEO_TO_VIDEO:
 		ret = gop_blt_vid_to_vid(this, buffer, operation, sx, sy, dx,
-					 dy, width, height, delta);
+					 dy, width, height, delta, vid_bpp);
 		break;
 	case EFI_BLT_VIDEO_TO_BLT_BUFFER:
 		ret = gop_blt_vid_to_buf(this, buffer, operation, sx, sy, dx,
-					 dy, width, height, delta);
+					 dy, width, height, delta, vid_bpp);
 		break;
 	default:
 		ret = EFI_UNSUPPORTED;