diff mbox series

[v2,5/5] soundwire: bus: Don't exit early if no device IDs were programmed

Message ID 20220907085259.3602-6-rf@opensource.cirrus.com
State New
Headers show
Series soundwire: Fixes for spurious and missing UNATTACH | expand

Commit Message

Richard Fitzgerald Sept. 7, 2022, 8:52 a.m. UTC
Only exit sdw_handle_slave_status() right after calling
sdw_program_device_num() if it actually programmed an ID into at
least one device.

sdw_handle_slave_status() should protect itself against phantom
device #0 ATTACHED indications. In that case there is no actual
device still on #0. The early exit relies on there being a status
change to ATTACHED on the reprogrammed device to trigger another
call to sdw_handle_slave_status() which will then handle the status
of all peripherals. If no device was actually programmed with an
ID there won't be a new ATTACHED indication. This can lead to the
status of other peripherals not being handled.

The status passed to sdw_handle_slave_status() is obviously always
from a point of time in the past, and may indicate accumulated
unhandled events (depending how the bus manager operates). It's
possible that a device ID is reprogrammed but the last PING status
captured state just before that, when it was still reporting on
ID #0. Then sdw_handle_slave_status() is called with this PING info,
just before a new PING status is available showing it now on its new
ID. So sdw_handle_slave_status() will receive a phantom report of a
device on #0, but it will not find one.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
---
 drivers/soundwire/bus.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

Comments

Pierre-Louis Bossart Sept. 12, 2022, 11:43 a.m. UTC | #1
On 9/7/22 10:52, Richard Fitzgerald wrote:
> Only exit sdw_handle_slave_status() right after calling
> sdw_program_device_num() if it actually programmed an ID into at
> least one device.
> 
> sdw_handle_slave_status() should protect itself against phantom
> device #0 ATTACHED indications. In that case there is no actual
> device still on #0. The early exit relies on there being a status
> change to ATTACHED on the reprogrammed device to trigger another
> call to sdw_handle_slave_status() which will then handle the status
> of all peripherals. If no device was actually programmed with an
> ID there won't be a new ATTACHED indication. This can lead to the
> status of other peripherals not being handled.
> 
> The status passed to sdw_handle_slave_status() is obviously always
> from a point of time in the past, and may indicate accumulated
> unhandled events (depending how the bus manager operates). It's
> possible that a device ID is reprogrammed but the last PING status
> captured state just before that, when it was still reporting on
> ID #0. Then sdw_handle_slave_status() is called with this PING info,
> just before a new PING status is available showing it now on its new
> ID. So sdw_handle_slave_status() will receive a phantom report of a
> device on #0, but it will not find one.
> 
> Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
> ---
>  drivers/soundwire/bus.c | 27 +++++++++++++++------------
>  1 file changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
> index 6e569a875a9b..0bcc2d161eb9 100644
> --- a/drivers/soundwire/bus.c
> +++ b/drivers/soundwire/bus.c
> @@ -736,20 +736,19 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>  	struct sdw_slave_id id;
>  	struct sdw_msg msg;
>  	bool found;
> -	int count = 0, ret;
> +	int count = 0, num_programmed = 0, ret;
>  	u64 addr;
>  
>  	/* No Slave, so use raw xfer api */
>  	ret = sdw_fill_msg(&msg, NULL, SDW_SCP_DEVID_0,
>  			   SDW_NUM_DEV_ID_REGISTERS, 0, SDW_MSG_FLAG_READ, buf);
>  	if (ret < 0)
> -		return ret;
> +		return 0;

this doesn't seem quite right to me, there are multiple -EINVAL cases
handled in sdw_fill_msg().

I didn't check if all these error cases are irrelevant in that specific
enumeration case, if that was the case maybe we need to break that
function in two helpers so that all the checks can be skipped.

>  
>  	do {
>  		ret = sdw_transfer(bus, &msg);
>  		if (ret == -ENODATA) { /* end of device id reads */
>  			dev_dbg(bus->dev, "No more devices to enumerate\n");
> -			ret = 0;
>  			break;
>  		}
>  		if (ret < 0) {
> @@ -781,7 +780,7 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>  				 * assigned a device ID.
>  				 */
>  				if (slave->status != SDW_SLAVE_UNATTACHED)
> -					return 0;
> +					return num_programmed;
>  
>  				/*
>  				 * Assign a new dev_num to this Slave and
> @@ -794,9 +793,11 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>  					dev_err(bus->dev,
>  						"Assign dev_num failed:%d\n",
>  						ret);
> -					return ret;
> +					return num_programmed;
>  				}
>  
> +				++num_programmed;
> +
>  				break;
>  			}
>  		}
> @@ -825,7 +826,7 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>  
>  	} while (ret == 0 && count < (SDW_MAX_DEVICES * 2));
>  
> -	return ret;
> +	return num_programmed;
>  }
>  
>  static void sdw_modify_slave_status(struct sdw_slave *slave,
> @@ -1787,14 +1788,16 @@ int sdw_handle_slave_status(struct sdw_bus *bus,
>  
>  	if (status[0] == SDW_SLAVE_ATTACHED) {
>  		dev_dbg(bus->dev, "Slave attached, programming device number\n");
> -		ret = sdw_program_device_num(bus);
> -		if (ret < 0)
> -			dev_err(bus->dev, "Slave attach failed: %d\n", ret);
> +
>  		/*
> -		 * programming a device number will have side effects,
> -		 * so we deal with other devices at a later time
> +		 * Programming a device number will have side effects,
> +		 * so we deal with other devices at a later time.
> +		 * But only if any devices were reprogrammed, because
> +		 * this relies on its PING state changing to ATTACHED,
> +		 * triggering a status change.
>  		 */
> -		return ret;
> +		if (sdw_program_device_num(bus))
> +			return 0;
>  	}
>  
>  	/* Continue to check other slave statuses */
Richard Fitzgerald Sept. 12, 2022, 12:25 p.m. UTC | #2
On 12/09/2022 12:43, Pierre-Louis Bossart wrote:
> 
> 
> On 9/7/22 10:52, Richard Fitzgerald wrote:
>> Only exit sdw_handle_slave_status() right after calling
>> sdw_program_device_num() if it actually programmed an ID into at
>> least one device.
>>
>> sdw_handle_slave_status() should protect itself against phantom
>> device #0 ATTACHED indications. In that case there is no actual
>> device still on #0. The early exit relies on there being a status
>> change to ATTACHED on the reprogrammed device to trigger another
>> call to sdw_handle_slave_status() which will then handle the status
>> of all peripherals. If no device was actually programmed with an
>> ID there won't be a new ATTACHED indication. This can lead to the
>> status of other peripherals not being handled.
>>
>> The status passed to sdw_handle_slave_status() is obviously always
>> from a point of time in the past, and may indicate accumulated
>> unhandled events (depending how the bus manager operates). It's
>> possible that a device ID is reprogrammed but the last PING status
>> captured state just before that, when it was still reporting on
>> ID #0. Then sdw_handle_slave_status() is called with this PING info,
>> just before a new PING status is available showing it now on its new
>> ID. So sdw_handle_slave_status() will receive a phantom report of a
>> device on #0, but it will not find one.
>>
>> Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
>> ---
>>   drivers/soundwire/bus.c | 27 +++++++++++++++------------
>>   1 file changed, 15 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
>> index 6e569a875a9b..0bcc2d161eb9 100644
>> --- a/drivers/soundwire/bus.c
>> +++ b/drivers/soundwire/bus.c
>> @@ -736,20 +736,19 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>>   	struct sdw_slave_id id;
>>   	struct sdw_msg msg;
>>   	bool found;
>> -	int count = 0, ret;
>> +	int count = 0, num_programmed = 0, ret;
>>   	u64 addr;
>>   
>>   	/* No Slave, so use raw xfer api */
>>   	ret = sdw_fill_msg(&msg, NULL, SDW_SCP_DEVID_0,
>>   			   SDW_NUM_DEV_ID_REGISTERS, 0, SDW_MSG_FLAG_READ, buf);
>>   	if (ret < 0)
>> -		return ret;
>> +		return 0;
> 
> this doesn't seem quite right to me, there are multiple -EINVAL cases
> handled in sdw_fill_msg().
> 
> I didn't check if all these error cases are irrelevant in that specific
> enumeration case, if that was the case maybe we need to break that
> function in two helpers so that all the checks can be skipped.
> 

I don't think that there's anything useful that
sdw_modify_slave_status() could do to recover from an error.

If any device IDs were programmed then, according to the statement in
sdw_modify_slave_status()

	* programming a device number will have side effects,
	* so we deal with other devices at a later time

if this is true, then we need to exit to deal with what _was_
programmed, even if one of them failed.

If nothing was programmed, and there was an error, we can't bail out of
sdw_modify_slave_status(). We have status for other devices which
we can't simply ignore.

Ultimately I can't see how pushing the error code up is useful.
sdw_modify_slave_status() can't really do any effective recovery action,
and the original behavior of giving up and returning means that
an error in programming dev ID potentially causes collateral damage to
the status of other peripherals.

>>   
>>   	do {
>>   		ret = sdw_transfer(bus, &msg);
>>   		if (ret == -ENODATA) { /* end of device id reads */
>>   			dev_dbg(bus->dev, "No more devices to enumerate\n");
>> -			ret = 0;
>>   			break;
>>   		}
>>   		if (ret < 0) {
>> @@ -781,7 +780,7 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>>   				 * assigned a device ID.
>>   				 */
>>   				if (slave->status != SDW_SLAVE_UNATTACHED)
>> -					return 0;
>> +					return num_programmed;
>>   
>>   				/*
>>   				 * Assign a new dev_num to this Slave and
>> @@ -794,9 +793,11 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>>   					dev_err(bus->dev,
>>   						"Assign dev_num failed:%d\n",
>>   						ret);
>> -					return ret;
>> +					return num_programmed;
>>   				}
>>   
>> +				++num_programmed;
>> +
>>   				break;
>>   			}
>>   		}
>> @@ -825,7 +826,7 @@ static int sdw_program_device_num(struct sdw_bus *bus)
>>   
>>   	} while (ret == 0 && count < (SDW_MAX_DEVICES * 2));
>>   
>> -	return ret;
>> +	return num_programmed;
>>   }
>>   
>>   static void sdw_modify_slave_status(struct sdw_slave *slave,
>> @@ -1787,14 +1788,16 @@ int sdw_handle_slave_status(struct sdw_bus *bus,
>>   
>>   	if (status[0] == SDW_SLAVE_ATTACHED) {
>>   		dev_dbg(bus->dev, "Slave attached, programming device number\n");
>> -		ret = sdw_program_device_num(bus);
>> -		if (ret < 0)
>> -			dev_err(bus->dev, "Slave attach failed: %d\n", ret);
>> +
>>   		/*
>> -		 * programming a device number will have side effects,
>> -		 * so we deal with other devices at a later time
>> +		 * Programming a device number will have side effects,
>> +		 * so we deal with other devices at a later time.
>> +		 * But only if any devices were reprogrammed, because
>> +		 * this relies on its PING state changing to ATTACHED,
>> +		 * triggering a status change.
>>   		 */
>> -		return ret;
>> +		if (sdw_program_device_num(bus))
>> +			return 0;
>>   	}
>>   
>>   	/* Continue to check other slave statuses */
Pierre-Louis Bossart Sept. 12, 2022, 5:09 p.m. UTC | #3
On 9/12/22 14:25, Richard Fitzgerald wrote:
> On 12/09/2022 12:43, Pierre-Louis Bossart wrote:
>>
>>
>> On 9/7/22 10:52, Richard Fitzgerald wrote:
>>> Only exit sdw_handle_slave_status() right after calling
>>> sdw_program_device_num() if it actually programmed an ID into at
>>> least one device.
>>>
>>> sdw_handle_slave_status() should protect itself against phantom
>>> device #0 ATTACHED indications. In that case there is no actual
>>> device still on #0. The early exit relies on there being a status
>>> change to ATTACHED on the reprogrammed device to trigger another
>>> call to sdw_handle_slave_status() which will then handle the status
>>> of all peripherals. If no device was actually programmed with an
>>> ID there won't be a new ATTACHED indication. This can lead to the
>>> status of other peripherals not being handled.
>>>
>>> The status passed to sdw_handle_slave_status() is obviously always
>>> from a point of time in the past, and may indicate accumulated
>>> unhandled events (depending how the bus manager operates). It's
>>> possible that a device ID is reprogrammed but the last PING status
>>> captured state just before that, when it was still reporting on
>>> ID #0. Then sdw_handle_slave_status() is called with this PING info,
>>> just before a new PING status is available showing it now on its new
>>> ID. So sdw_handle_slave_status() will receive a phantom report of a
>>> device on #0, but it will not find one.
>>>
>>> Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
>>> ---
>>>   drivers/soundwire/bus.c | 27 +++++++++++++++------------
>>>   1 file changed, 15 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
>>> index 6e569a875a9b..0bcc2d161eb9 100644
>>> --- a/drivers/soundwire/bus.c
>>> +++ b/drivers/soundwire/bus.c
>>> @@ -736,20 +736,19 @@ static int sdw_program_device_num(struct
>>> sdw_bus *bus)
>>>       struct sdw_slave_id id;
>>>       struct sdw_msg msg;
>>>       bool found;
>>> -    int count = 0, ret;
>>> +    int count = 0, num_programmed = 0, ret;
>>>       u64 addr;
>>>         /* No Slave, so use raw xfer api */
>>>       ret = sdw_fill_msg(&msg, NULL, SDW_SCP_DEVID_0,
>>>                  SDW_NUM_DEV_ID_REGISTERS, 0, SDW_MSG_FLAG_READ, buf);
>>>       if (ret < 0)
>>> -        return ret;
>>> +        return 0;
>>
>> this doesn't seem quite right to me, there are multiple -EINVAL cases
>> handled in sdw_fill_msg().
>>
>> I didn't check if all these error cases are irrelevant in that specific
>> enumeration case, if that was the case maybe we need to break that
>> function in two helpers so that all the checks can be skipped.
>>
> 
> I don't think that there's anything useful that
> sdw_modify_slave_status() could do to recover from an error.
> 
> If any device IDs were programmed then, according to the statement in
> sdw_modify_slave_status()
> 
>     * programming a device number will have side effects,
>     * so we deal with other devices at a later time
> 
> if this is true, then we need to exit to deal with what _was_
> programmed, even if one of them failed.
> 
> If nothing was programmed, and there was an error, we can't bail out of
> sdw_modify_slave_status(). We have status for other devices which
> we can't simply ignore.
> 
> Ultimately I can't see how pushing the error code up is useful.
> sdw_modify_slave_status() can't really do any effective recovery action,
> and the original behavior of giving up and returning means that
> an error in programming dev ID potentially causes collateral damage to
> the status of other peripherals.

I was suggesting something like


void sdw_fill_msg_data(...)
{
  copy data in the msg structure
}

int sdw_fill_msg(...)
{
    sdw_fill_msg_data();
    handle_error_cases
}

and in sdw sdw_program_device_num() we call directly sdw_fill_msg_data()

So no change in functionality beyond explicit skip of error checks that
are not relevant and cannot be handled even if they were.
diff mbox series

Patch

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 6e569a875a9b..0bcc2d161eb9 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -736,20 +736,19 @@  static int sdw_program_device_num(struct sdw_bus *bus)
 	struct sdw_slave_id id;
 	struct sdw_msg msg;
 	bool found;
-	int count = 0, ret;
+	int count = 0, num_programmed = 0, ret;
 	u64 addr;
 
 	/* No Slave, so use raw xfer api */
 	ret = sdw_fill_msg(&msg, NULL, SDW_SCP_DEVID_0,
 			   SDW_NUM_DEV_ID_REGISTERS, 0, SDW_MSG_FLAG_READ, buf);
 	if (ret < 0)
-		return ret;
+		return 0;
 
 	do {
 		ret = sdw_transfer(bus, &msg);
 		if (ret == -ENODATA) { /* end of device id reads */
 			dev_dbg(bus->dev, "No more devices to enumerate\n");
-			ret = 0;
 			break;
 		}
 		if (ret < 0) {
@@ -781,7 +780,7 @@  static int sdw_program_device_num(struct sdw_bus *bus)
 				 * assigned a device ID.
 				 */
 				if (slave->status != SDW_SLAVE_UNATTACHED)
-					return 0;
+					return num_programmed;
 
 				/*
 				 * Assign a new dev_num to this Slave and
@@ -794,9 +793,11 @@  static int sdw_program_device_num(struct sdw_bus *bus)
 					dev_err(bus->dev,
 						"Assign dev_num failed:%d\n",
 						ret);
-					return ret;
+					return num_programmed;
 				}
 
+				++num_programmed;
+
 				break;
 			}
 		}
@@ -825,7 +826,7 @@  static int sdw_program_device_num(struct sdw_bus *bus)
 
 	} while (ret == 0 && count < (SDW_MAX_DEVICES * 2));
 
-	return ret;
+	return num_programmed;
 }
 
 static void sdw_modify_slave_status(struct sdw_slave *slave,
@@ -1787,14 +1788,16 @@  int sdw_handle_slave_status(struct sdw_bus *bus,
 
 	if (status[0] == SDW_SLAVE_ATTACHED) {
 		dev_dbg(bus->dev, "Slave attached, programming device number\n");
-		ret = sdw_program_device_num(bus);
-		if (ret < 0)
-			dev_err(bus->dev, "Slave attach failed: %d\n", ret);
+
 		/*
-		 * programming a device number will have side effects,
-		 * so we deal with other devices at a later time
+		 * Programming a device number will have side effects,
+		 * so we deal with other devices at a later time.
+		 * But only if any devices were reprogrammed, because
+		 * this relies on its PING state changing to ATTACHED,
+		 * triggering a status change.
 		 */
-		return ret;
+		if (sdw_program_device_num(bus))
+			return 0;
 	}
 
 	/* Continue to check other slave statuses */