diff mbox series

[v2,30/34] drm/bridge: imx8qxp-pixel-combiner: convert to devm_drm_bridge_alloc() API

Message ID 20250424-drm-bridge-convert-to-alloc-api-v2-30-8f91a404d86b@bootlin.com
State New
Headers show
Series drm: convert all bridges to devm_drm_bridge_alloc() | expand

Commit Message

Luca Ceresoli April 24, 2025, 6:59 p.m. UTC
This is the new API for allocating DRM bridges.

This driver embeds an array of channels in the main struct, and each
channel embeds a drm_bridge. This prevents dynamic, refcount-based
deallocation of the bridges.

To make the new, dynamic bridge allocation possible:

 * change the array of channels into an array of channel pointers
 * allocate each channel using devm_drm_bridge_alloc()
 * adapt the code wherever using the channels

Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>

---

Cc: Liu Ying <victor.liu@nxp.com>
---
 drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

Comments

Liu Ying April 29, 2025, 2:10 a.m. UTC | #1
Hi,

On 04/25/2025, Luca Ceresoli wrote:
> This is the new API for allocating DRM bridges.
> 
> This driver embeds an array of channels in the main struct, and each
> channel embeds a drm_bridge. This prevents dynamic, refcount-based
> deallocation of the bridges.
> 
> To make the new, dynamic bridge allocation possible:
> 
>  * change the array of channels into an array of channel pointers
>  * allocate each channel using devm_drm_bridge_alloc()
>  * adapt the code wherever using the channels
> 
> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
> 
> ---
> 
> Cc: Liu Ying <victor.liu@nxp.com>
> ---
>  drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
> index 1f6fd488e7039e943351006d3373009f0c15cb08..40a8a5a53a781137e722309ff91692cf90d881da 100644
> --- a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
> +++ b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
> @@ -68,7 +68,7 @@ struct imx8qxp_pc_channel {
>  
>  struct imx8qxp_pc {
>  	struct device *dev;
> -	struct imx8qxp_pc_channel ch[2];
> +	struct imx8qxp_pc_channel *ch[2];
>  	struct clk *clk_apb;
>  	void __iomem *base;
>  };
> @@ -307,7 +307,14 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>  			goto free_child;
>  		}
>  
> -		ch = &pc->ch[i];
> +		ch = devm_drm_bridge_alloc(dev, struct imx8qxp_pc_channel, bridge,
> +					   &imx8qxp_pc_bridge_funcs);
> +		if (IS_ERR(ch)) {
> +			ret = PTR_ERR(ch);
> +			goto free_child;
> +		}
> +
> +		pc->ch[i] = ch;
>  		ch->pc = pc;
>  		ch->stream_id = i;
>  
> @@ -333,7 +340,6 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>  		of_node_put(remote);
>  
>  		ch->bridge.driver_private = ch;
> -		ch->bridge.funcs = &imx8qxp_pc_bridge_funcs;
>  		ch->bridge.of_node = child;
>  		ch->is_available = true;
>  
> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>  free_child:
>  	of_node_put(child);
>  
> -	if (i == 1 && pc->ch[0].next_bridge)
> -		drm_bridge_remove(&pc->ch[0].bridge);
> +	if (i == 1 && pc->ch[0]->next_bridge)

Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
dereference here...

> +		drm_bridge_remove(&pc->ch[0]->bridge);
>  
>  	pm_runtime_disable(dev);
>  	return ret;
> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
>  	int i;
>  
>  	for (i = 0; i < 2; i++) {
> -		ch = &pc->ch[i];
> +		ch = pc->ch[i];
>  
>  		if (!ch->is_available)

...and here too.

This is what I get when removing the imx8qxp_pixel_combiner module.
-8<-
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000144
Mem abort info:
  ESR = 0x0000000096000004
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
  FSC = 0x04: level 0 translation fault
Data abort info:
  ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
  CM = 0, WnR = 0, TnD = 0, TagAccess = 0
  GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
user pgtable: 4k pages, 48-bit VAs, pgdp=000000089e613000
[0000000000000144] pgd=0000000000000000, p4d=0000000000000000
Internal error: Oops: 0000000096000004 [#1]  SMP
Modules linked in: mpl3115 isl29018 industrialio_triggered_buffer kfifo_buf cdns3 cdns_usb_common snd_soc_imx_audmix rtc_imx_sc imx_sc_wdt imx_sc_thermal imx_sc_key imx8qxp_pixel_link6
CPU: 1 UID: 0 PID: 528 Comm: modprobe Not tainted 6.15.0-rc3-next-20250424-00059-gee51752c256e #217 PREEMPT 
Hardware name: Freescale i.MX8QXP MEK (DT)
pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : imx8qxp_pc_bridge_remove+0x38/0x6c [imx8qxp_pixel_combiner]
lr : imx8qxp_pc_bridge_remove+0x30/0x6c [imx8qxp_pixel_combiner]
sp : ffff8000840f3c40
x29: ffff8000840f3c40 x28: ffff00081e75d780 x27: 0000000000000000
x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
x23: ffff0008100fc090 x22: ffff0008100fe490 x21: ffff00081e69de00
x20: 0000000000000000 x19: ffff0008100fe410 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
x14: 0000000000000013 x13: ffff000810049010 x12: 0000000000000000
x11: ffff0008182bc550 x10: ffff0008182bc490 x9 : ffff000810049010
x8 : 0101010101010101 x7 : 7f7f7f7f7f7f7f7f x6 : fefefeff6364626d
x5 : 8080808000000000 x4 : 0000000000000000 x3 : 0000000000000000
x2 : ffff00081e75d780 x1 : ffff00081e75d780 x0 : ffff80007af9bdc0
Call trace:
 imx8qxp_pc_bridge_remove+0x38/0x6c [imx8qxp_pixel_combiner] (P)
 platform_remove+0x28/0x44
 device_remove+0x4c/0x80
 device_release_driver_internal+0x1c8/0x224
 driver_detach+0x50/0x98
 bus_remove_driver+0x6c/0xbc
 driver_unregister+0x30/0x60
 platform_driver_unregister+0x14/0x20
 imx8qxp_pc_bridge_driver_exit+0x18/0x814 [imx8qxp_pixel_combiner]
 __arm64_sys_delete_module+0x184/0x264
 invoke_syscall+0x48/0x110
 el0_svc_common.constprop.0+0xc8/0xe8
 do_el0_svc+0x20/0x2c
 el0_svc+0x30/0xd0
 el0t_64_sync_handler+0x144/0x168
 el0t_64_sync+0x198/0x19c
Code: aa1503e0 97f547f8 390512bf f9400a94 (39451280) 
---[ end trace 0000000000000000 ]---
-8<-

On top of this patch series, this issue doesn't happen if I apply the below
change:
diff --git a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
index 40a8a5a53a78..2eb0ade65d89 100644
--- a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
+++ b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
@@ -63,7 +63,6 @@ struct imx8qxp_pc_channel {
        struct drm_bridge *next_bridge;
        struct imx8qxp_pc *pc;
        unsigned int stream_id;
-       bool is_available;
 };
 
 struct imx8qxp_pc {
@@ -341,7 +340,6 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
 
                ch->bridge.driver_private = ch;
                ch->bridge.of_node = child;
-               ch->is_available = true;
 
                drm_bridge_add(&ch->bridge);
        }
@@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
 free_child:
        of_node_put(child);
 
-       if (i == 1 && pc->ch[0]->next_bridge)
+       if (i == 1 && pc->ch[0])
                drm_bridge_remove(&pc->ch[0]->bridge);
 
        pm_runtime_disable(dev);
@@ -367,11 +365,8 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
        for (i = 0; i < 2; i++) {
                ch = pc->ch[i];
 
-               if (!ch->is_available)
-                       continue;
-
-               drm_bridge_remove(&ch->bridge);
-               ch->is_available = false;
+               if (ch)
+                       drm_bridge_remove(&ch->bridge);
        }
 
        pm_runtime_disable(&pdev->dev);

>  			continue;
>
Luca Ceresoli April 30, 2025, 9:29 a.m. UTC | #2
Hello Liu,

On Tue, 29 Apr 2025 10:10:55 +0800
Liu Ying <victor.liu@nxp.com> wrote:

> Hi,
> 
> On 04/25/2025, Luca Ceresoli wrote:
> > This is the new API for allocating DRM bridges.
> > 
> > This driver embeds an array of channels in the main struct, and each
> > channel embeds a drm_bridge. This prevents dynamic, refcount-based
> > deallocation of the bridges.
> > 
> > To make the new, dynamic bridge allocation possible:
> > 
> >  * change the array of channels into an array of channel pointers
> >  * allocate each channel using devm_drm_bridge_alloc()
> >  * adapt the code wherever using the channels
> > 
> > Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>

[...]

> > @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
> >  free_child:
> >  	of_node_put(child);
> >  
> > -	if (i == 1 && pc->ch[0].next_bridge)
> > -		drm_bridge_remove(&pc->ch[0].bridge);
> > +	if (i == 1 && pc->ch[0]->next_bridge)  
> 
> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
> dereference here...

See below for this.

> > +		drm_bridge_remove(&pc->ch[0]->bridge);
> >  
> >  	pm_runtime_disable(dev);
> >  	return ret;
> > @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
> >  	int i;
> >  
> >  	for (i = 0; i < 2; i++) {
> > -		ch = &pc->ch[i];
> > +		ch = pc->ch[i];
> >  
> >  		if (!ch->is_available)  
> 
> ...and here too.

This is indeed a bug, I should have checked the pointer for being
non-NULL.

Looking at that more closely, I think the is_available flag can be
entirely removed now. The allocation itself (ch != NULL) now is
equivalent. Do you think my reasoning is correct?

Ouch! After writing the previous paragraph I realized you proposed this
a few lines below! OK, removing is_available. :)

[...]

> On top of this patch series, this issue doesn't happen if I apply the below
> change:

[...]

> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>  free_child:
>         of_node_put(child);
>  
> -       if (i == 1 && pc->ch[0]->next_bridge)
> +       if (i == 1 && pc->ch[0])
>                 drm_bridge_remove(&pc->ch[0]->bridge);

Unrelated to this patch, but as I looked at it more in depth now, I'm
not sure this whole logic is robust, even in the original code.

The 'i == 1' check here seems to mean "if some error happened when
handling channel@1, that means channel@0 was successfully initialized,
so let's clean up channel 0".

However my understanding of the bindings is that device tree is allowed
to have the channel@1 node before the channel@0 node (or even channel@1
without channel@0, but that's less problematic here).

In such case (channel@1 before channel@0), this would happen:

 1. alloc and init ch[1], all OK
 2. alloc and init ch[0], an error happens
    (e.g. of_graph_get_remote_node() fails)

So we'd reach the free_child: label, and we should call
drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.

To be robust in such a case, I think both channels need to be checked
independently, as the status of one does not imply the status of the
other. E.g.:

  for (i = 0; i < 2; i++)
      if (pc->ch[i] && pc->ch[i]->next_bridge)
          drm_bridge_remove(&pc->ch[i]->bridge);

(which is similar to what .remove() does after the changes discussed in
this thread, and which I have queued for v3)

What's your opinion? Do you think I missed anything?

Thanks for taking the time to dig into this!

Best regards,
Luca
Liu Ying May 6, 2025, 2:24 a.m. UTC | #3
On 04/30/2025, Luca Ceresoli wrote:
> Hello Liu,

Hi Luca,

> 
> On Tue, 29 Apr 2025 10:10:55 +0800
> Liu Ying <victor.liu@nxp.com> wrote:
> 
>> Hi,
>>
>> On 04/25/2025, Luca Ceresoli wrote:
>>> This is the new API for allocating DRM bridges.
>>>
>>> This driver embeds an array of channels in the main struct, and each
>>> channel embeds a drm_bridge. This prevents dynamic, refcount-based
>>> deallocation of the bridges.
>>>
>>> To make the new, dynamic bridge allocation possible:
>>>
>>>  * change the array of channels into an array of channel pointers
>>>  * allocate each channel using devm_drm_bridge_alloc()
>>>  * adapt the code wherever using the channels
>>>
>>> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
> 
> [...]
> 
>>> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>>  free_child:
>>>  	of_node_put(child);
>>>  
>>> -	if (i == 1 && pc->ch[0].next_bridge)
>>> -		drm_bridge_remove(&pc->ch[0].bridge);
>>> +	if (i == 1 && pc->ch[0]->next_bridge)  
>>
>> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
>> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
>> dereference here...
> 
> See below for this.
> 
>>> +		drm_bridge_remove(&pc->ch[0]->bridge);
>>>  
>>>  	pm_runtime_disable(dev);
>>>  	return ret;
>>> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
>>>  	int i;
>>>  
>>>  	for (i = 0; i < 2; i++) {
>>> -		ch = &pc->ch[i];
>>> +		ch = pc->ch[i];
>>>  
>>>  		if (!ch->is_available)  
>>
>> ...and here too.
> 
> This is indeed a bug, I should have checked the pointer for being
> non-NULL.
> 
> Looking at that more closely, I think the is_available flag can be
> entirely removed now. The allocation itself (ch != NULL) now is
> equivalent. Do you think my reasoning is correct?
> 
> Ouch! After writing the previous paragraph I realized you proposed this
> a few lines below! OK, removing is_available. :)
> 
> [...]
> 
>> On top of this patch series, this issue doesn't happen if I apply the below
>> change:
> 
> [...]
> 
>> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>  free_child:
>>         of_node_put(child);
>>  
>> -       if (i == 1 && pc->ch[0]->next_bridge)
>> +       if (i == 1 && pc->ch[0])
>>                 drm_bridge_remove(&pc->ch[0]->bridge);
> 
> Unrelated to this patch, but as I looked at it more in depth now, I'm
> not sure this whole logic is robust, even in the original code.
> 
> The 'i == 1' check here seems to mean "if some error happened when
> handling channel@1, that means channel@0 was successfully initialized,
> so let's clean up channel 0".
> 
> However my understanding of the bindings is that device tree is allowed
> to have the channel@1 node before the channel@0 node (or even channel@1
> without channel@0, but that's less problematic here).
> 
> In such case (channel@1 before channel@0), this would happen:
> 
>  1. alloc and init ch[1], all OK
>  2. alloc and init ch[0], an error happens
>     (e.g. of_graph_get_remote_node() fails)
> 
> So we'd reach the free_child: label, and we should call
> drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.
> 
> To be robust in such a case, I think both channels need to be checked
> independently, as the status of one does not imply the status of the
> other. E.g.:
> 
>   for (i = 0; i < 2; i++)
>       if (pc->ch[i] && pc->ch[i]->next_bridge)
>           drm_bridge_remove(&pc->ch[i]->bridge);
> 
> (which is similar to what .remove() does after the changes discussed in
> this thread, and which I have queued for v3)
> 
> What's your opinion? Do you think I missed anything?

The pixel combiner DT node would be added in imx8-ss-dc{0,1}.dtsi, please
see the case for imx8-ss-dc0.dtsi introduced by an in-flight patch[1].  As
channel@{0,1} child nodes always exist(DT overlay cannot effectively delete
any of them) and channel@0 always comes first, there is no problematic case.

> 
> Thanks for taking the time to dig into this!

After looking into this patch and patch 31(though I've already provided my A-b)
more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
should have the same life time with the embedded DRM bridges, because for
example the clk_apb clock in struct imx8qxp_pc would be accessed by the
imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
the life time for the embedded channel/bridge structures only, but not for the
main structures.  What do you think ?

> 
> Best regards,
> Luca
> 

[1] https://lore.kernel.org/dri-devel/20250414035028.1561475-17-victor.liu@nxp.com/
Luca Ceresoli May 6, 2025, 8:47 p.m. UTC | #4
Hello Liu,

thanks for your further feedback.

On Tue, 6 May 2025 10:24:18 +0800
Liu Ying <victor.liu@nxp.com> wrote:

> On 04/30/2025, Luca Ceresoli wrote:
> > Hello Liu,  
> 
> Hi Luca,
> 
> > 
> > On Tue, 29 Apr 2025 10:10:55 +0800
> > Liu Ying <victor.liu@nxp.com> wrote:
> >   
> >> Hi,
> >>
> >> On 04/25/2025, Luca Ceresoli wrote:  
> >>> This is the new API for allocating DRM bridges.
> >>>
> >>> This driver embeds an array of channels in the main struct, and each
> >>> channel embeds a drm_bridge. This prevents dynamic, refcount-based
> >>> deallocation of the bridges.
> >>>
> >>> To make the new, dynamic bridge allocation possible:
> >>>
> >>>  * change the array of channels into an array of channel pointers
> >>>  * allocate each channel using devm_drm_bridge_alloc()
> >>>  * adapt the code wherever using the channels
> >>>
> >>> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>  
> > 
> > [...]
> >   
> >>> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
> >>>  free_child:
> >>>  	of_node_put(child);
> >>>  
> >>> -	if (i == 1 && pc->ch[0].next_bridge)
> >>> -		drm_bridge_remove(&pc->ch[0].bridge);
> >>> +	if (i == 1 && pc->ch[0]->next_bridge)    
> >>
> >> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
> >> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
> >> dereference here...  
> > 
> > See below for this.
> >   
> >>> +		drm_bridge_remove(&pc->ch[0]->bridge);
> >>>  
> >>>  	pm_runtime_disable(dev);
> >>>  	return ret;
> >>> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
> >>>  	int i;
> >>>  
> >>>  	for (i = 0; i < 2; i++) {
> >>> -		ch = &pc->ch[i];
> >>> +		ch = pc->ch[i];
> >>>  
> >>>  		if (!ch->is_available)    
> >>
> >> ...and here too.  
> > 
> > This is indeed a bug, I should have checked the pointer for being
> > non-NULL.
> > 
> > Looking at that more closely, I think the is_available flag can be
> > entirely removed now. The allocation itself (ch != NULL) now is
> > equivalent. Do you think my reasoning is correct?
> > 
> > Ouch! After writing the previous paragraph I realized you proposed this
> > a few lines below! OK, removing is_available. :)
> > 
> > [...]
> >   
> >> On top of this patch series, this issue doesn't happen if I apply the below
> >> change:  
> > 
> > [...]
> >   
> >> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
> >>  free_child:
> >>         of_node_put(child);
> >>  
> >> -       if (i == 1 && pc->ch[0]->next_bridge)
> >> +       if (i == 1 && pc->ch[0])
> >>                 drm_bridge_remove(&pc->ch[0]->bridge);  
> > 
> > Unrelated to this patch, but as I looked at it more in depth now, I'm
> > not sure this whole logic is robust, even in the original code.
> > 
> > The 'i == 1' check here seems to mean "if some error happened when
> > handling channel@1, that means channel@0 was successfully initialized,
> > so let's clean up channel 0".
> > 
> > However my understanding of the bindings is that device tree is allowed
> > to have the channel@1 node before the channel@0 node (or even channel@1
> > without channel@0, but that's less problematic here).
> > 
> > In such case (channel@1 before channel@0), this would happen:
> > 
> >  1. alloc and init ch[1], all OK
> >  2. alloc and init ch[0], an error happens
> >     (e.g. of_graph_get_remote_node() fails)
> > 
> > So we'd reach the free_child: label, and we should call
> > drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.
> > 
> > To be robust in such a case, I think both channels need to be checked
> > independently, as the status of one does not imply the status of the
> > other. E.g.:
> > 
> >   for (i = 0; i < 2; i++)
> >       if (pc->ch[i] && pc->ch[i]->next_bridge)
> >           drm_bridge_remove(&pc->ch[i]->bridge);
> > 
> > (which is similar to what .remove() does after the changes discussed in
> > this thread, and which I have queued for v3)
> > 
> > What's your opinion? Do you think I missed anything?  
> 
> The pixel combiner DT node would be added in imx8-ss-dc{0,1}.dtsi, please
> see the case for imx8-ss-dc0.dtsi introduced by an in-flight patch[1].  As
> channel@{0,1} child nodes always exist(DT overlay cannot effectively delete
> any of them) and channel@0 always comes first, there is no problematic case.

I'm not questioning what existing and future dts files (will) contain,
and surely I don't see a good reason someone would write channel@1
before channel@0.

My point is:

 - the bindings _allow_ channel1 before channel@0
 - the error management code after the free_child label won't work
   correctly if channel1 is before channel@0 in the device tree

IOW the driver is not robust against all legal device tree descriptions,
and it could be easily made robust using the example code in my
previous e-mail (quoted a few lines above).

If you agree about this I'll be happy to send a patch doing that change.
If you think I'm wrong, I won't fight a battle. This topic is
orthogonal to the change I'm introducing in this patch, and I can
continue the conversion independently from this discussion.

> > Thanks for taking the time to dig into this!  
> 
> After looking into this patch and patch 31(though I've already provided my A-b)
> more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
> should have the same life time with the embedded DRM bridges, because for
> example the clk_apb clock in struct imx8qxp_pc would be accessed by the
> imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
> the life time for the embedded channel/bridge structures only, but not for the
> main structures.  What do you think ?

I see you concern, but I'm sure the change I'm introducing is not
creating the problem you are concerned about.

The key aspect is that my patch is merely changing the lifetime of the
_allocation_ of the drm_bridge, not its usage. On drm_bridge_remove()
the bridge is removed from its encoder chain and it is completely not
reachable, both before and after my patch. With my patch it is not
freed immediately, but it's just a piece of "wasted" memory that is
still allocated until elsewhere in the kernel there are pointers to it,
to avoid use-after-free.

With this explanation, do you think my patch is correct (after fixing
the bug we already discussed of course)?

Best regards,
Luca
Liu Ying May 7, 2025, 2:10 a.m. UTC | #5
On 05/07/2025, Luca Ceresoli wrote:
> Hello Liu,

Hi Luca,

> 
> thanks for your further feedback.
> 
> On Tue, 6 May 2025 10:24:18 +0800
> Liu Ying <victor.liu@nxp.com> wrote:
> 
>> On 04/30/2025, Luca Ceresoli wrote:
>>> Hello Liu,  
>>
>> Hi Luca,
>>
>>>
>>> On Tue, 29 Apr 2025 10:10:55 +0800
>>> Liu Ying <victor.liu@nxp.com> wrote:
>>>   
>>>> Hi,
>>>>
>>>> On 04/25/2025, Luca Ceresoli wrote:  
>>>>> This is the new API for allocating DRM bridges.
>>>>>
>>>>> This driver embeds an array of channels in the main struct, and each
>>>>> channel embeds a drm_bridge. This prevents dynamic, refcount-based
>>>>> deallocation of the bridges.
>>>>>
>>>>> To make the new, dynamic bridge allocation possible:
>>>>>
>>>>>  * change the array of channels into an array of channel pointers
>>>>>  * allocate each channel using devm_drm_bridge_alloc()
>>>>>  * adapt the code wherever using the channels
>>>>>
>>>>> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>  
>>>
>>> [...]
>>>   
>>>>> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>>>>  free_child:
>>>>>  	of_node_put(child);
>>>>>  
>>>>> -	if (i == 1 && pc->ch[0].next_bridge)
>>>>> -		drm_bridge_remove(&pc->ch[0].bridge);
>>>>> +	if (i == 1 && pc->ch[0]->next_bridge)    
>>>>
>>>> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
>>>> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
>>>> dereference here...  
>>>
>>> See below for this.
>>>   
>>>>> +		drm_bridge_remove(&pc->ch[0]->bridge);
>>>>>  
>>>>>  	pm_runtime_disable(dev);
>>>>>  	return ret;
>>>>> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
>>>>>  	int i;
>>>>>  
>>>>>  	for (i = 0; i < 2; i++) {
>>>>> -		ch = &pc->ch[i];
>>>>> +		ch = pc->ch[i];
>>>>>  
>>>>>  		if (!ch->is_available)    
>>>>
>>>> ...and here too.  
>>>
>>> This is indeed a bug, I should have checked the pointer for being
>>> non-NULL.
>>>
>>> Looking at that more closely, I think the is_available flag can be
>>> entirely removed now. The allocation itself (ch != NULL) now is
>>> equivalent. Do you think my reasoning is correct?
>>>
>>> Ouch! After writing the previous paragraph I realized you proposed this
>>> a few lines below! OK, removing is_available. :)
>>>
>>> [...]
>>>   
>>>> On top of this patch series, this issue doesn't happen if I apply the below
>>>> change:  
>>>
>>> [...]
>>>   
>>>> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>>>  free_child:
>>>>         of_node_put(child);
>>>>  
>>>> -       if (i == 1 && pc->ch[0]->next_bridge)
>>>> +       if (i == 1 && pc->ch[0])
>>>>                 drm_bridge_remove(&pc->ch[0]->bridge);  
>>>
>>> Unrelated to this patch, but as I looked at it more in depth now, I'm
>>> not sure this whole logic is robust, even in the original code.
>>>
>>> The 'i == 1' check here seems to mean "if some error happened when
>>> handling channel@1, that means channel@0 was successfully initialized,
>>> so let's clean up channel 0".
>>>
>>> However my understanding of the bindings is that device tree is allowed
>>> to have the channel@1 node before the channel@0 node (or even channel@1
>>> without channel@0, but that's less problematic here).
>>>
>>> In such case (channel@1 before channel@0), this would happen:
>>>
>>>  1. alloc and init ch[1], all OK
>>>  2. alloc and init ch[0], an error happens
>>>     (e.g. of_graph_get_remote_node() fails)
>>>
>>> So we'd reach the free_child: label, and we should call
>>> drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.
>>>
>>> To be robust in such a case, I think both channels need to be checked
>>> independently, as the status of one does not imply the status of the
>>> other. E.g.:
>>>
>>>   for (i = 0; i < 2; i++)
>>>       if (pc->ch[i] && pc->ch[i]->next_bridge)
>>>           drm_bridge_remove(&pc->ch[i]->bridge);
>>>
>>> (which is similar to what .remove() does after the changes discussed in
>>> this thread, and which I have queued for v3)
>>>
>>> What's your opinion? Do you think I missed anything?  
>>
>> The pixel combiner DT node would be added in imx8-ss-dc{0,1}.dtsi, please
>> see the case for imx8-ss-dc0.dtsi introduced by an in-flight patch[1].  As
>> channel@{0,1} child nodes always exist(DT overlay cannot effectively delete
>> any of them) and channel@0 always comes first, there is no problematic case.
> 
> I'm not questioning what existing and future dts files (will) contain,
> and surely I don't see a good reason someone would write channel@1
> before channel@0.
> 
> My point is:
> 
>  - the bindings _allow_ channel1 before channel@0
>  - the error management code after the free_child label won't work
>    correctly if channel1 is before channel@0 in the device tree
> 
> IOW the driver is not robust against all legal device tree descriptions,
> and it could be easily made robust using the example code in my
> previous e-mail (quoted a few lines above).
> 
> If you agree about this I'll be happy to send a patch doing that change.
> If you think I'm wrong, I won't fight a battle. This topic is
> orthogonal to the change I'm introducing in this patch, and I can
> continue the conversion independently from this discussion.

I don't think it is necessary to do that change for now.  When someone
really comes across this issue, we may make the error management code
robust.

> 
>>> Thanks for taking the time to dig into this!  
>>
>> After looking into this patch and patch 31(though I've already provided my A-b)
>> more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
>> should have the same life time with the embedded DRM bridges, because for
>> example the clk_apb clock in struct imx8qxp_pc would be accessed by the
>> imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
>> the life time for the embedded channel/bridge structures only, but not for the
>> main structures.  What do you think ?
> 
> I see you concern, but I'm sure the change I'm introducing is not
> creating the problem you are concerned about.
> 
> The key aspect is that my patch is merely changing the lifetime of the
> _allocation_ of the drm_bridge, not its usage. On drm_bridge_remove()
> the bridge is removed from its encoder chain and it is completely not
> reachable, both before and after my patch. With my patch it is not
> freed immediately, but it's just a piece of "wasted" memory that is
> still allocated until elsewhere in the kernel there are pointers to it,
> to avoid use-after-free.
> 
> With this explanation, do you think my patch is correct (after fixing
> the bug we already discussed of course)?

I tend to say your patch is not correct because we'll eventually make sure
that removing a bridge module is safe when doing atomic commit, which means
the main structures should have the same life time with the DRM bridges.

> 
> Best regards,
> Luca
>
Luca Ceresoli May 7, 2025, 7:12 a.m. UTC | #6
Hello Liu,

On Wed, 7 May 2025 10:10:53 +0800
Liu Ying <victor.liu@nxp.com> wrote:

> On 05/07/2025, Luca Ceresoli wrote:
> > Hello Liu,  
> 
> Hi Luca,
> 
> > 
> > thanks for your further feedback.
> > 
> > On Tue, 6 May 2025 10:24:18 +0800
> > Liu Ying <victor.liu@nxp.com> wrote:
> >   
> >> On 04/30/2025, Luca Ceresoli wrote:  
> >>> Hello Liu,    
> >>
> >> Hi Luca,
> >>  
> >>>
> >>> On Tue, 29 Apr 2025 10:10:55 +0800
> >>> Liu Ying <victor.liu@nxp.com> wrote:
> >>>     
> >>>> Hi,
> >>>>
> >>>> On 04/25/2025, Luca Ceresoli wrote:    
> >>>>> This is the new API for allocating DRM bridges.
> >>>>>
> >>>>> This driver embeds an array of channels in the main struct, and each
> >>>>> channel embeds a drm_bridge. This prevents dynamic, refcount-based
> >>>>> deallocation of the bridges.
> >>>>>
> >>>>> To make the new, dynamic bridge allocation possible:
> >>>>>
> >>>>>  * change the array of channels into an array of channel pointers
> >>>>>  * allocate each channel using devm_drm_bridge_alloc()
> >>>>>  * adapt the code wherever using the channels
> >>>>>
> >>>>> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>    
> >>>
> >>> [...]
> >>>     
> >>>>> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
> >>>>>  free_child:
> >>>>>  	of_node_put(child);
> >>>>>  
> >>>>> -	if (i == 1 && pc->ch[0].next_bridge)
> >>>>> -		drm_bridge_remove(&pc->ch[0].bridge);
> >>>>> +	if (i == 1 && pc->ch[0]->next_bridge)      
> >>>>
> >>>> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
> >>>> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
> >>>> dereference here...    
> >>>
> >>> See below for this.
> >>>     
> >>>>> +		drm_bridge_remove(&pc->ch[0]->bridge);
> >>>>>  
> >>>>>  	pm_runtime_disable(dev);
> >>>>>  	return ret;
> >>>>> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
> >>>>>  	int i;
> >>>>>  
> >>>>>  	for (i = 0; i < 2; i++) {
> >>>>> -		ch = &pc->ch[i];
> >>>>> +		ch = pc->ch[i];
> >>>>>  
> >>>>>  		if (!ch->is_available)      
> >>>>
> >>>> ...and here too.    
> >>>
> >>> This is indeed a bug, I should have checked the pointer for being
> >>> non-NULL.
> >>>
> >>> Looking at that more closely, I think the is_available flag can be
> >>> entirely removed now. The allocation itself (ch != NULL) now is
> >>> equivalent. Do you think my reasoning is correct?
> >>>
> >>> Ouch! After writing the previous paragraph I realized you proposed this
> >>> a few lines below! OK, removing is_available. :)
> >>>
> >>> [...]
> >>>     
> >>>> On top of this patch series, this issue doesn't happen if I apply the below
> >>>> change:    
> >>>
> >>> [...]
> >>>     
> >>>> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
> >>>>  free_child:
> >>>>         of_node_put(child);
> >>>>  
> >>>> -       if (i == 1 && pc->ch[0]->next_bridge)
> >>>> +       if (i == 1 && pc->ch[0])
> >>>>                 drm_bridge_remove(&pc->ch[0]->bridge);    
> >>>
> >>> Unrelated to this patch, but as I looked at it more in depth now, I'm
> >>> not sure this whole logic is robust, even in the original code.
> >>>
> >>> The 'i == 1' check here seems to mean "if some error happened when
> >>> handling channel@1, that means channel@0 was successfully initialized,
> >>> so let's clean up channel 0".
> >>>
> >>> However my understanding of the bindings is that device tree is allowed
> >>> to have the channel@1 node before the channel@0 node (or even channel@1
> >>> without channel@0, but that's less problematic here).
> >>>
> >>> In such case (channel@1 before channel@0), this would happen:
> >>>
> >>>  1. alloc and init ch[1], all OK
> >>>  2. alloc and init ch[0], an error happens
> >>>     (e.g. of_graph_get_remote_node() fails)
> >>>
> >>> So we'd reach the free_child: label, and we should call
> >>> drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.
> >>>
> >>> To be robust in such a case, I think both channels need to be checked
> >>> independently, as the status of one does not imply the status of the
> >>> other. E.g.:
> >>>
> >>>   for (i = 0; i < 2; i++)
> >>>       if (pc->ch[i] && pc->ch[i]->next_bridge)
> >>>           drm_bridge_remove(&pc->ch[i]->bridge);
> >>>
> >>> (which is similar to what .remove() does after the changes discussed in
> >>> this thread, and which I have queued for v3)
> >>>
> >>> What's your opinion? Do you think I missed anything?    
> >>
> >> The pixel combiner DT node would be added in imx8-ss-dc{0,1}.dtsi, please
> >> see the case for imx8-ss-dc0.dtsi introduced by an in-flight patch[1].  As
> >> channel@{0,1} child nodes always exist(DT overlay cannot effectively delete
> >> any of them) and channel@0 always comes first, there is no problematic case.  
> > 
> > I'm not questioning what existing and future dts files (will) contain,
> > and surely I don't see a good reason someone would write channel@1
> > before channel@0.
> > 
> > My point is:
> > 
> >  - the bindings _allow_ channel1 before channel@0
> >  - the error management code after the free_child label won't work
> >    correctly if channel1 is before channel@0 in the device tree
> > 
> > IOW the driver is not robust against all legal device tree descriptions,
> > and it could be easily made robust using the example code in my
> > previous e-mail (quoted a few lines above).
> > 
> > If you agree about this I'll be happy to send a patch doing that change.
> > If you think I'm wrong, I won't fight a battle. This topic is
> > orthogonal to the change I'm introducing in this patch, and I can
> > continue the conversion independently from this discussion.  
> 
> I don't think it is necessary to do that change for now.  When someone
> really comes across this issue, we may make the error management code
> robust.
> 
> >   
> >>> Thanks for taking the time to dig into this!    
> >>
> >> After looking into this patch and patch 31(though I've already provided my A-b)
> >> more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
> >> should have the same life time with the embedded DRM bridges, because for
> >> example the clk_apb clock in struct imx8qxp_pc would be accessed by the
> >> imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
> >> the life time for the embedded channel/bridge structures only, but not for the
> >> main structures.  What do you think ?  
> > 
> > I see you concern, but I'm sure the change I'm introducing is not
> > creating the problem you are concerned about.
> > 
> > The key aspect is that my patch is merely changing the lifetime of the
> > _allocation_ of the drm_bridge, not its usage. On drm_bridge_remove()
> > the bridge is removed from its encoder chain and it is completely not
> > reachable, both before and after my patch. With my patch it is not
> > freed immediately, but it's just a piece of "wasted" memory that is
> > still allocated until elsewhere in the kernel there are pointers to it,
> > to avoid use-after-free.
> > 
> > With this explanation, do you think my patch is correct (after fixing
> > the bug we already discussed of course)?  
> 
> I tend to say your patch is not correct because we'll eventually make sure
> that removing a bridge module is safe when doing atomic commit,

I think your sentence can be rephrased as "your patch is correct with
the current code base where bridges are not (yet) removable, but there
will be a problem when they start to actually be removable".

Is my understanding correct? If it is, I agree on that sentence.

The work to have removable bridges is massive and non-trivial, so it
will need to be tackled in steps. The grand plan [0] is:

 1. add refcounting to DRM bridges (struct drm_bridge)
 2. handle gracefully atomic updates during bridge removal
 3. avoid DSI host drivers to have dangling pointers to DSI devices 
 4. finish the hotplug bridge work, removing the "always-disconnected"
    connector, moving code to the core and potentially removing the
    hotplug-bridge itself (this needs to be clarified as points 1-3 are
    developed)

I am at step 1 right now. Removal during atomic updates is step 2,
ideas about how to implement that are already being discussed [1],
there's a practical plan proposed by Maxime with the goal of reaching
removable bridges without breaking things along the path.

[0] https://lore.kernel.org/lkml/20250206-hotplug-drm-bridge-v6-0-9d6f2c9c3058@bootlin.com/
[1] https://lore.kernel.org/all/20250106-vigorous-talented-viper-fa49d9@houat/

> which means
> the main structures should have the same life time with the DRM bridges.

The word "lifetime" mean two things for bridges:

 * the time span during which memory is allocated for a struct
   drm_bridge (along with the embedding struct)
 * the time span during which a DRM bridge is active/used/usable as
   part of a card
   - i.e. when it is part of an encoder chain
   - i.e. when drm_bridge_funcs callbacks can be called
   - i.e. from drm_bridge_add() to drm_bridge_remove()

These two lifetimes used to be nearly the same. Now the "memory
allocation lifetime" is extended, but the "bridge existence" is
unchanged: drm_bridge_add() to drm_bridge_remove() are called in the
same place and do the same things, so the bridge will stop being in any
encoder chain at the exact same time. now we are just keeping a piece of
memory allocated for a longer time.

Seen in another way, the events used to be:

 * probe:
   - allocate bridge
   - drm_bridge_add()

 * remove
   - drm_bridge_remove()
   - now the bridge is not used, it's just some dead memory [*]
   - kfree bridge (either in .remove() or just after by devm)

Now it becomes:

 * probe:
   - allocate bridge
   - drm_bridge_add()

 * remove
   - drm_bridge_remove()
   - now the bridge is not used, it's just some dead memory [*]
   - maybe some more time, possibly long, until the last put [*]
   - kfree bridge (by devm)

The duration of the [*] steps changes, but it's harmless because the
bridge is not used at all. No change except for memory allocation.

Luca
Liu Ying May 7, 2025, 10:16 a.m. UTC | #7
On 05/07/2025, Luca Ceresoli wrote:
> Hello Liu,
> 
> On Wed, 7 May 2025 10:10:53 +0800
> Liu Ying <victor.liu@nxp.com> wrote:
> 
>> On 05/07/2025, Luca Ceresoli wrote:
>>> Hello Liu,  
>>
>> Hi Luca,
>>
>>>
>>> thanks for your further feedback.
>>>
>>> On Tue, 6 May 2025 10:24:18 +0800
>>> Liu Ying <victor.liu@nxp.com> wrote:
>>>   
>>>> On 04/30/2025, Luca Ceresoli wrote:  
>>>>> Hello Liu,    
>>>>
>>>> Hi Luca,
>>>>  
>>>>>
>>>>> On Tue, 29 Apr 2025 10:10:55 +0800
>>>>> Liu Ying <victor.liu@nxp.com> wrote:
>>>>>     
>>>>>> Hi,
>>>>>>
>>>>>> On 04/25/2025, Luca Ceresoli wrote:    
>>>>>>> This is the new API for allocating DRM bridges.
>>>>>>>
>>>>>>> This driver embeds an array of channels in the main struct, and each
>>>>>>> channel embeds a drm_bridge. This prevents dynamic, refcount-based
>>>>>>> deallocation of the bridges.
>>>>>>>
>>>>>>> To make the new, dynamic bridge allocation possible:
>>>>>>>
>>>>>>>  * change the array of channels into an array of channel pointers
>>>>>>>  * allocate each channel using devm_drm_bridge_alloc()
>>>>>>>  * adapt the code wherever using the channels
>>>>>>>
>>>>>>> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>    
>>>>>
>>>>> [...]
>>>>>     
>>>>>>> @@ -345,8 +351,8 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>>>>>>  free_child:
>>>>>>>  	of_node_put(child);
>>>>>>>  
>>>>>>> -	if (i == 1 && pc->ch[0].next_bridge)
>>>>>>> -		drm_bridge_remove(&pc->ch[0].bridge);
>>>>>>> +	if (i == 1 && pc->ch[0]->next_bridge)      
>>>>>>
>>>>>> Since this patch makes pc->ch[0] and pc->ch[1] be allocated separately,
>>>>>> pc->ch[0] could be NULL if channel0 is not available, hence a NULL pointer
>>>>>> dereference here...    
>>>>>
>>>>> See below for this.
>>>>>     
>>>>>>> +		drm_bridge_remove(&pc->ch[0]->bridge);
>>>>>>>  
>>>>>>>  	pm_runtime_disable(dev);
>>>>>>>  	return ret;
>>>>>>> @@ -359,7 +365,7 @@ static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
>>>>>>>  	int i;
>>>>>>>  
>>>>>>>  	for (i = 0; i < 2; i++) {
>>>>>>> -		ch = &pc->ch[i];
>>>>>>> +		ch = pc->ch[i];
>>>>>>>  
>>>>>>>  		if (!ch->is_available)      
>>>>>>
>>>>>> ...and here too.    
>>>>>
>>>>> This is indeed a bug, I should have checked the pointer for being
>>>>> non-NULL.
>>>>>
>>>>> Looking at that more closely, I think the is_available flag can be
>>>>> entirely removed now. The allocation itself (ch != NULL) now is
>>>>> equivalent. Do you think my reasoning is correct?
>>>>>
>>>>> Ouch! After writing the previous paragraph I realized you proposed this
>>>>> a few lines below! OK, removing is_available. :)
>>>>>
>>>>> [...]
>>>>>     
>>>>>> On top of this patch series, this issue doesn't happen if I apply the below
>>>>>> change:    
>>>>>
>>>>> [...]
>>>>>     
>>>>>> @@ -351,7 +349,7 @@ static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
>>>>>>  free_child:
>>>>>>         of_node_put(child);
>>>>>>  
>>>>>> -       if (i == 1 && pc->ch[0]->next_bridge)
>>>>>> +       if (i == 1 && pc->ch[0])
>>>>>>                 drm_bridge_remove(&pc->ch[0]->bridge);    
>>>>>
>>>>> Unrelated to this patch, but as I looked at it more in depth now, I'm
>>>>> not sure this whole logic is robust, even in the original code.
>>>>>
>>>>> The 'i == 1' check here seems to mean "if some error happened when
>>>>> handling channel@1, that means channel@0 was successfully initialized,
>>>>> so let's clean up channel 0".
>>>>>
>>>>> However my understanding of the bindings is that device tree is allowed
>>>>> to have the channel@1 node before the channel@0 node (or even channel@1
>>>>> without channel@0, but that's less problematic here).
>>>>>
>>>>> In such case (channel@1 before channel@0), this would happen:
>>>>>
>>>>>  1. alloc and init ch[1], all OK
>>>>>  2. alloc and init ch[0], an error happens
>>>>>     (e.g. of_graph_get_remote_node() fails)
>>>>>
>>>>> So we'd reach the free_child: label, and we should call
>>>>> drm_bridge_remove() for ch[1]->bridge, but there's no code to do that.
>>>>>
>>>>> To be robust in such a case, I think both channels need to be checked
>>>>> independently, as the status of one does not imply the status of the
>>>>> other. E.g.:
>>>>>
>>>>>   for (i = 0; i < 2; i++)
>>>>>       if (pc->ch[i] && pc->ch[i]->next_bridge)
>>>>>           drm_bridge_remove(&pc->ch[i]->bridge);
>>>>>
>>>>> (which is similar to what .remove() does after the changes discussed in
>>>>> this thread, and which I have queued for v3)
>>>>>
>>>>> What's your opinion? Do you think I missed anything?    
>>>>
>>>> The pixel combiner DT node would be added in imx8-ss-dc{0,1}.dtsi, please
>>>> see the case for imx8-ss-dc0.dtsi introduced by an in-flight patch[1].  As
>>>> channel@{0,1} child nodes always exist(DT overlay cannot effectively delete
>>>> any of them) and channel@0 always comes first, there is no problematic case.  
>>>
>>> I'm not questioning what existing and future dts files (will) contain,
>>> and surely I don't see a good reason someone would write channel@1
>>> before channel@0.
>>>
>>> My point is:
>>>
>>>  - the bindings _allow_ channel1 before channel@0
>>>  - the error management code after the free_child label won't work
>>>    correctly if channel1 is before channel@0 in the device tree
>>>
>>> IOW the driver is not robust against all legal device tree descriptions,
>>> and it could be easily made robust using the example code in my
>>> previous e-mail (quoted a few lines above).
>>>
>>> If you agree about this I'll be happy to send a patch doing that change.
>>> If you think I'm wrong, I won't fight a battle. This topic is
>>> orthogonal to the change I'm introducing in this patch, and I can
>>> continue the conversion independently from this discussion.  
>>
>> I don't think it is necessary to do that change for now.  When someone
>> really comes across this issue, we may make the error management code
>> robust.
>>
>>>   
>>>>> Thanks for taking the time to dig into this!    
>>>>
>>>> After looking into this patch and patch 31(though I've already provided my A-b)
>>>> more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
>>>> should have the same life time with the embedded DRM bridges, because for
>>>> example the clk_apb clock in struct imx8qxp_pc would be accessed by the
>>>> imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
>>>> the life time for the embedded channel/bridge structures only, but not for the
>>>> main structures.  What do you think ?  
>>>
>>> I see you concern, but I'm sure the change I'm introducing is not
>>> creating the problem you are concerned about.
>>>
>>> The key aspect is that my patch is merely changing the lifetime of the
>>> _allocation_ of the drm_bridge, not its usage. On drm_bridge_remove()
>>> the bridge is removed from its encoder chain and it is completely not
>>> reachable, both before and after my patch. With my patch it is not
>>> freed immediately, but it's just a piece of "wasted" memory that is
>>> still allocated until elsewhere in the kernel there are pointers to it,
>>> to avoid use-after-free.
>>>
>>> With this explanation, do you think my patch is correct (after fixing
>>> the bug we already discussed of course)?  
>>
>> I tend to say your patch is not correct because we'll eventually make sure
>> that removing a bridge module is safe when doing atomic commit,
> 
> I think your sentence can be rephrased as "your patch is correct with
> the current code base where bridges are not (yet) removable, but there
> will be a problem when they start to actually be removable".
> 
> Is my understanding correct? If it is, I agree on that sentence.

Nope, I meant your patch should align the life times of the main structures
and the DRM bridges, for the sake of the kinda long term goal - remove bridge
driver module safely when doing atomic commit.

> 
> The work to have removable bridges is massive and non-trivial, so it
> will need to be tackled in steps. The grand plan [0] is:
> 
>  1. add refcounting to DRM bridges (struct drm_bridge)
>  2. handle gracefully atomic updates during bridge removal
>  3. avoid DSI host drivers to have dangling pointers to DSI devices 
>  4. finish the hotplug bridge work, removing the "always-disconnected"
>     connector, moving code to the core and potentially removing the
>     hotplug-bridge itself (this needs to be clarified as points 1-3 are
>     developed)

I'm busy with internal things these days and cannot look into the grand
plan and steps closely, sorry about that.

> 
> I am at step 1 right now. Removal during atomic updates is step 2,
> ideas about how to implement that are already being discussed [1],
> there's a practical plan proposed by Maxime with the goal of reaching
> removable bridges without breaking things along the path.
> 
> [0] https://lore.kernel.org/lkml/20250206-hotplug-drm-bridge-v6-0-9d6f2c9c3058@bootlin.com/
> [1] https://lore.kernel.org/all/20250106-vigorous-talented-viper-fa49d9@houat/
> 
>> which means
>> the main structures should have the same life time with the DRM bridges.
> 
> The word "lifetime" mean two things for bridges:
> 
>  * the time span during which memory is allocated for a struct
>    drm_bridge (along with the embedding struct)

Note that with your patch set the imx8*-ldb drivers and this bridge driver
won't allocate the DRM bridge along with the embedding struct. This makes
me worry, because maybe these drivers are the only "special" ones in this
patch set and I don't want them to be "special" after your patch set is
applied.

>  * the time span during which a DRM bridge is active/used/usable as
>    part of a card
>    - i.e. when it is part of an encoder chain
>    - i.e. when drm_bridge_funcs callbacks can be called
>    - i.e. from drm_bridge_add() to drm_bridge_remove()
> 
> These two lifetimes used to be nearly the same. Now the "memory
> allocation lifetime" is extended, but the "bridge existence" is
> unchanged: drm_bridge_add() to drm_bridge_remove() are called in the
> same place and do the same things, so the bridge will stop being in any
> encoder chain at the exact same time. now we are just keeping a piece of
> memory allocated for a longer time.
> 
> Seen in another way, the events used to be:
> 
>  * probe:
>    - allocate bridge
>    - drm_bridge_add()
> 
>  * remove
>    - drm_bridge_remove()
>    - now the bridge is not used, it's just some dead memory [*]
>    - kfree bridge (either in .remove() or just after by devm)
> 
> Now it becomes:
> 
>  * probe:
>    - allocate bridge
>    - drm_bridge_add()
> 
>  * remove
>    - drm_bridge_remove()
>    - now the bridge is not used, it's just some dead memory [*]
>    - maybe some more time, possibly long, until the last put [*]
>    - kfree bridge (by devm)
> 
> The duration of the [*] steps changes, but it's harmless because the
> bridge is not used at all. No change except for memory allocation.
> 
> Luca
>
Luca Ceresoli May 7, 2025, 2:13 p.m. UTC | #8
Hello Liu,

On Wed, 7 May 2025 18:16:28 +0800
Liu Ying <victor.liu@nxp.com> wrote:

[...]

> >>>> After looking into this patch and patch 31(though I've already provided my A-b)
> >>>> more closely, I think the imx8qxp_pc and imx8{qm,qxp}_ldb main structures
> >>>> should have the same life time with the embedded DRM bridges, because for
> >>>> example the clk_apb clock in struct imx8qxp_pc would be accessed by the
> >>>> imx8qxp_pc_bridge_mode_set DRM bridge callback.  But, IIUC, your patches extend
> >>>> the life time for the embedded channel/bridge structures only, but not for the
> >>>> main structures.  What do you think ?    
> >>>
> >>> I see you concern, but I'm sure the change I'm introducing is not
> >>> creating the problem you are concerned about.
> >>>
> >>> The key aspect is that my patch is merely changing the lifetime of the
> >>> _allocation_ of the drm_bridge, not its usage. On drm_bridge_remove()
> >>> the bridge is removed from its encoder chain and it is completely not
> >>> reachable, both before and after my patch. With my patch it is not
> >>> freed immediately, but it's just a piece of "wasted" memory that is
> >>> still allocated until elsewhere in the kernel there are pointers to it,
> >>> to avoid use-after-free.
> >>>
> >>> With this explanation, do you think my patch is correct (after fixing
> >>> the bug we already discussed of course)?    
> >>
> >> I tend to say your patch is not correct because we'll eventually make sure
> >> that removing a bridge module is safe when doing atomic commit,  
> > 
> > I think your sentence can be rephrased as "your patch is correct with
> > the current code base where bridges are not (yet) removable, but there
> > will be a problem when they start to actually be removable".
> > 
> > Is my understanding correct? If it is, I agree on that sentence.  
> 
> Nope, I meant your patch should align the life times of the main structures
> and the DRM bridges, for the sake of the kinda long term goal - remove bridge
> driver module safely when doing atomic commit.

Again, I don't think there is any bug introduced by this patch (once
the NULL ptr deref bug we already discussed is fixed). No bridge can be
removed as of now, with or without this patch.

You concern that this patch would make things more complex in the
future, when bridges will actually become removable and they could be
during atomic updates. But about this...

> > The work to have removable bridges is massive and non-trivial, so it
> > will need to be tackled in steps. The grand plan [0] is:
> > 
> >  1. add refcounting to DRM bridges (struct drm_bridge)
> >  2. handle gracefully atomic updates during bridge removal
> >  3. avoid DSI host drivers to have dangling pointers to DSI devices 
> >  4. finish the hotplug bridge work, removing the "always-disconnected"
> >     connector, moving code to the core and potentially removing the
> >     hotplug-bridge itself (this needs to be clarified as points 1-3 are
> >     developed)  
> 
> I'm busy with internal things these days and cannot look into the grand
> plan and steps closely, sorry about that.

...I'll wait until you have time to look into that more closely. There
is just no way to understand this whole topic without some dedicated
attention, which takes time unavoidably.

In the meanwhile I am going to send v3 soon with the known bug fixed,
so the best version is available to continue this discussion.

> > I am at step 1 right now. Removal during atomic updates is step 2,
> > ideas about how to implement that are already being discussed [1],
> > there's a practical plan proposed by Maxime with the goal of reaching
> > removable bridges without breaking things along the path.
> > 
> > [0] https://lore.kernel.org/lkml/20250206-hotplug-drm-bridge-v6-0-9d6f2c9c3058@bootlin.com/
> > [1] https://lore.kernel.org/all/20250106-vigorous-talented-viper-fa49d9@houat/
> >   
> >> which means
> >> the main structures should have the same life time with the DRM bridges.  
> > 
> > The word "lifetime" mean two things for bridges:
> > 
> >  * the time span during which memory is allocated for a struct
> >    drm_bridge (along with the embedding struct)  
> 
> Note that with your patch set the imx8*-ldb drivers and this bridge driver
> won't allocate the DRM bridge along with the embedding struct.

By "embedding struct" I mean the struct imx8qxp_pc_channel that embeds
the struct drm_bridge. Sorry, I realize my wording was ambiguous.

> This makes
> me worry, because maybe these drivers are the only "special" ones in this
> patch set and I don't want them to be "special" after your patch set is
> applied.

Luca
diff mbox series

Patch

diff --git a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
index 1f6fd488e7039e943351006d3373009f0c15cb08..40a8a5a53a781137e722309ff91692cf90d881da 100644
--- a/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
+++ b/drivers/gpu/drm/bridge/imx/imx8qxp-pixel-combiner.c
@@ -68,7 +68,7 @@  struct imx8qxp_pc_channel {
 
 struct imx8qxp_pc {
 	struct device *dev;
-	struct imx8qxp_pc_channel ch[2];
+	struct imx8qxp_pc_channel *ch[2];
 	struct clk *clk_apb;
 	void __iomem *base;
 };
@@ -307,7 +307,14 @@  static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
 			goto free_child;
 		}
 
-		ch = &pc->ch[i];
+		ch = devm_drm_bridge_alloc(dev, struct imx8qxp_pc_channel, bridge,
+					   &imx8qxp_pc_bridge_funcs);
+		if (IS_ERR(ch)) {
+			ret = PTR_ERR(ch);
+			goto free_child;
+		}
+
+		pc->ch[i] = ch;
 		ch->pc = pc;
 		ch->stream_id = i;
 
@@ -333,7 +340,6 @@  static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
 		of_node_put(remote);
 
 		ch->bridge.driver_private = ch;
-		ch->bridge.funcs = &imx8qxp_pc_bridge_funcs;
 		ch->bridge.of_node = child;
 		ch->is_available = true;
 
@@ -345,8 +351,8 @@  static int imx8qxp_pc_bridge_probe(struct platform_device *pdev)
 free_child:
 	of_node_put(child);
 
-	if (i == 1 && pc->ch[0].next_bridge)
-		drm_bridge_remove(&pc->ch[0].bridge);
+	if (i == 1 && pc->ch[0]->next_bridge)
+		drm_bridge_remove(&pc->ch[0]->bridge);
 
 	pm_runtime_disable(dev);
 	return ret;
@@ -359,7 +365,7 @@  static void imx8qxp_pc_bridge_remove(struct platform_device *pdev)
 	int i;
 
 	for (i = 0; i < 2; i++) {
-		ch = &pc->ch[i];
+		ch = pc->ch[i];
 
 		if (!ch->is_available)
 			continue;