mbox series

[v2,0/2] Fix problems fetching TBT3 DROM from AMD USB4 routers

Message ID 20230215172520.8925-1-mario.limonciello@amd.com
Headers show
Series Fix problems fetching TBT3 DROM from AMD USB4 routers | expand

Message

Mario Limonciello Feb. 15, 2023, 5:25 p.m. UTC
TBT3 devices when connected to an AMD USB4 router occasionally fail to
properly respond to requests for the DROM via bit banging.

Depending upon which part of the request failed will impact the severity.
A number of workarounds have been put in place to let the driver handle
the failed requests:

commit e87491a9fd4e3 ("thunderbolt: Retry DROM reads for more failure scenarios")
commit a283de3ec646f ("thunderbolt: Do not resume routers if UID is not set")
commit 6915812bbd109 ("thunderbolt: Do not make DROM read success compulsory")
commit f022ff7bf377 ("thunderbolt: Retry DROM read once if parsing fails")

Still even with these changes the failures do make it through. In comparing
other CM implementations utilized on AMD systems, they all access the
DROM directly from the NVM.

To avoid triggering this issue, try to get the DROM directly from the NVM
in Linux as well before resorting to bitbanging.

Mario Limonciello (2):
  thunderbolt: Adjust how NVM reading works
  thunderbolt: use `tb_eeprom_get_drom_offset` to discover DROM offset

 drivers/thunderbolt/eeprom.c | 150 +++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 68 deletions(-)

Comments

Mika Westerberg Feb. 16, 2023, 5:49 a.m. UTC | #1
Hi Mario,

On Wed, Feb 15, 2023 at 11:25:19AM -0600, Mario Limonciello wrote:
> Some TBT3 devices have a hard time reliably responding to bit banging
> requests correctly when connected to AMD USB4 hosts running Linux.
> 
> These problems are not reported in any other CM supported on AMD platforms,
> and comparing the Windows and Pre-OS implementations the Linux CM is the
> only one that utilizes bit banging to access the DROM.
> Other CM implementations access the DROM directly from the NVM instead of
> bit banging.
> 
> Adjust the flow to use this method to fetch the NVM when the downstream
> device is Thunderbolt 3 and only use bit banging to access TBT 2 or TBT 1
> devices. As the flow is modified, also remove the retry sequence that was
> introduced from commit f022ff7bf377 ("thunderbolt: Retry DROM read once
> if parsing fails") as it will not be necessary if the NVM is fetched this
> way.
> 
> Cc: stable@vger.kernel.org
> Fixes: f022ff7bf377 ("thunderbolt: Retry DROM read once if parsing fails")

I don't think it fixes a regression of above commit and I don't think
this is stable material because it is quite a big change. I would rather
let it sit in -rcX for a while to make sure no user visible changes are
accidentally introduced. Is this OK for you?

Did you check that the UUID of these (and other possible) devices stay
the same before and after the patch?

> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> v1->v2:
>  * Update commit message to indicate which CMs are tested
>  * Adjust flow to only fetch DROM from NVM on TBT3 and bit bang on TBT1/2
> ---
>  drivers/thunderbolt/eeprom.c | 145 +++++++++++++++++++----------------
>  1 file changed, 80 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/thunderbolt/eeprom.c b/drivers/thunderbolt/eeprom.c
> index c90d22f56d4e..d1be72b6afdb 100644
> --- a/drivers/thunderbolt/eeprom.c
> +++ b/drivers/thunderbolt/eeprom.c
> @@ -416,7 +416,7 @@ static int tb_drom_parse_entries(struct tb_switch *sw, size_t header_size)
>  		if (pos + 1 == drom_size || pos + entry->len > drom_size
>  				|| !entry->len) {
>  			tb_sw_warn(sw, "DROM buffer overrun\n");
> -			return -EILSEQ;
> +			return -EIO;
>  		}
>  
>  		switch (entry->type) {
> @@ -544,7 +544,37 @@ static int tb_drom_read_n(struct tb_switch *sw, u16 offset, u8 *val,
>  	return tb_eeprom_read_n(sw, offset, val, count);
>  }
>  
> -static int tb_drom_parse(struct tb_switch *sw)
> +static int tb_drom_bit_bang(struct tb_switch *sw, u16 *size)
> +{
> +	int res;
> +
> +	res = tb_drom_read_n(sw, 14, (u8 *) size, 2);
> +	if (res)
> +		return res;
> +	*size &= 0x3ff;
> +	*size += TB_DROM_DATA_START;
> +	tb_sw_dbg(sw, "reading drom (length: %#x)\n", *size);
> +	if (*size < sizeof(struct tb_drom_header)) {
> +		tb_sw_warn(sw, "drom too small, aborting\n");
> +		return -EIO;
> +	}
> +
> +	sw->drom = kzalloc(*size, GFP_KERNEL);
> +	if (!sw->drom)
> +		return -ENOMEM;
> +
> +	res = tb_drom_read_n(sw, 0, sw->drom, *size);
> +	if (res)
> +		goto err;
> +
> +	return 0;
> +err:
> +	kfree(sw->drom);
> +	sw->drom = NULL;
> +	return res;
> +}

Can you split the refactoring part into a separate patch?

> +
> +static int tb_drom_parse_v1(struct tb_switch *sw)
>  {
>  	const struct tb_drom_header *header =
>  		(const struct tb_drom_header *)sw->drom;
> @@ -555,7 +585,7 @@ static int tb_drom_parse(struct tb_switch *sw)
>  		tb_sw_warn(sw,
>  			"DROM UID CRC8 mismatch (expected: %#x, got: %#x)\n",
>  			header->uid_crc8, crc);
> -		return -EILSEQ;
> +		return -EIO;
>  	}
>  	if (!sw->uid)
>  		sw->uid = header->uid;
> @@ -589,6 +619,43 @@ static int usb4_drom_parse(struct tb_switch *sw)
>  	return tb_drom_parse_entries(sw, USB4_DROM_HEADER_SIZE);
>  }
>  
> +static int tb_drom_parse(struct tb_switch *sw, u16 *size)
> +{
> +	struct tb_drom_header *header = (void *) sw->drom;
> +	int res;
> +
> +	if (header->data_len + TB_DROM_DATA_START != *size) {
> +		tb_sw_warn(sw, "drom size mismatch\n");
> +		goto err;
> +	}
> +
> +	tb_sw_dbg(sw, "DROM version: %d\n", header->device_rom_revision);
> +
> +	switch (header->device_rom_revision) {
> +	case 3:
> +		res = usb4_drom_parse(sw);
> +		break;
> +	default:
> +		tb_sw_warn(sw, "DROM device_rom_revision %#x unknown\n",
> +			   header->device_rom_revision);
> +		fallthrough;
> +	case 1:
> +		res = tb_drom_parse_v1(sw);
> +		break;
> +	}
> +
> +	if (res) {
> +		tb_sw_warn(sw, "parsing DROM failed\n");
> +		goto err;
> +	}
> +
> +	return 0;
> +err:
> +	kfree(sw->drom);
> +	sw->drom = NULL;
> +	return -EIO;
> +}
> +
>  /**
>   * tb_drom_read() - Copy DROM to sw->drom and parse it
>   * @sw: Router whose DROM to read and parse
> @@ -602,8 +669,7 @@ static int usb4_drom_parse(struct tb_switch *sw)
>  int tb_drom_read(struct tb_switch *sw)
>  {
>  	u16 size;
> -	struct tb_drom_header *header;
> -	int res, retries = 1;
> +	int res;
>  
>  	if (sw->drom)
>  		return 0;
> @@ -614,11 +680,11 @@ int tb_drom_read(struct tb_switch *sw)
>  		 * in a device property. Use it if available.
>  		 */
>  		if (tb_drom_copy_efi(sw, &size) == 0)
> -			goto parse;
> +			return tb_drom_parse(sw, &size);
>  
>  		/* Non-Apple hardware has the DROM as part of NVM */
>  		if (tb_drom_copy_nvm(sw, &size) == 0)
> -			goto parse;
> +			return tb_drom_parse(sw, &size);
>  
>  		/*
>  		 * USB4 hosts may support reading DROM through router
> @@ -627,7 +693,7 @@ int tb_drom_read(struct tb_switch *sw)
>  		if (tb_switch_is_usb4(sw)) {
>  			usb4_switch_read_uid(sw, &sw->uid);
>  			if (!usb4_copy_host_drom(sw, &size))
> -				goto parse;
> +				return tb_drom_parse(sw, &size);
>  		} else {
>  			/*
>  			 * The root switch contains only a dummy drom
> @@ -640,64 +706,13 @@ int tb_drom_read(struct tb_switch *sw)
>  		return 0;
>  	}
>  
> -	res = tb_drom_read_n(sw, 14, (u8 *) &size, 2);
> +	/* TBT3 devices have the DROM as part of NVM */
> +	if (sw->generation < 3)

This is true for TBT2 devices too. I think you want to check for the
sw->cap_lc here instead. If it is set the device has LC and therefore we
can use the LC UUID registers to figure out the UUID in later stages.
Otherwise we need to read it through bitbanging.

> +		res = tb_drom_bit_bang(sw, &size);
> +	else
> +		res = tb_drom_copy_nvm(sw, &size);
>  	if (res)
>  		return res;
> -	size &= 0x3ff;
> -	size += TB_DROM_DATA_START;
> -	tb_sw_dbg(sw, "reading drom (length: %#x)\n", size);
> -	if (size < sizeof(*header)) {
> -		tb_sw_warn(sw, "drom too small, aborting\n");
> -		return -EIO;
> -	}
> -
> -	sw->drom = kzalloc(size, GFP_KERNEL);
> -	if (!sw->drom)
> -		return -ENOMEM;
> -read:
> -	res = tb_drom_read_n(sw, 0, sw->drom, size);
> -	if (res)
> -		goto err;
> -
> -parse:
> -	header = (void *) sw->drom;
> -
> -	if (header->data_len + TB_DROM_DATA_START != size) {
> -		tb_sw_warn(sw, "drom size mismatch\n");
> -		if (retries--) {
> -			msleep(100);
> -			goto read;
> -		}
> -		goto err;
> -	}
>  
> -	tb_sw_dbg(sw, "DROM version: %d\n", header->device_rom_revision);
> -
> -	switch (header->device_rom_revision) {
> -	case 3:
> -		res = usb4_drom_parse(sw);
> -		break;
> -	default:
> -		tb_sw_warn(sw, "DROM device_rom_revision %#x unknown\n",
> -			   header->device_rom_revision);
> -		fallthrough;
> -	case 1:
> -		res = tb_drom_parse(sw);
> -		break;
> -	}
> -
> -	/* If the DROM parsing fails, wait a moment and retry once */
> -	if (res == -EILSEQ && retries--) {
> -		tb_sw_warn(sw, "parsing DROM failed\n");
> -		msleep(100);
> -		goto read;
> -	}
> -
> -	if (!res)
> -		return 0;
> -
> -err:
> -	kfree(sw->drom);
> -	sw->drom = NULL;
> -	return -EIO;
> +	return tb_drom_parse(sw, &size);
>  }
> -- 
> 2.34.1
Mario Limonciello Feb. 16, 2023, 6:04 a.m. UTC | #2
On 2/15/23 23:49, Mika Westerberg wrote:
> Hi Mario,
>
> On Wed, Feb 15, 2023 at 11:25:19AM -0600, Mario Limonciello wrote:
>> Some TBT3 devices have a hard time reliably responding to bit banging
>> requests correctly when connected to AMD USB4 hosts running Linux.
>>
>> These problems are not reported in any other CM supported on AMD platforms,
>> and comparing the Windows and Pre-OS implementations the Linux CM is the
>> only one that utilizes bit banging to access the DROM.
>> Other CM implementations access the DROM directly from the NVM instead of
>> bit banging.
>>
>> Adjust the flow to use this method to fetch the NVM when the downstream
>> device is Thunderbolt 3 and only use bit banging to access TBT 2 or TBT 1
>> devices. As the flow is modified, also remove the retry sequence that was
>> introduced from commit f022ff7bf377 ("thunderbolt: Retry DROM read once
>> if parsing fails") as it will not be necessary if the NVM is fetched this
>> way.
>>
>> Cc: stable@vger.kernel.org
>> Fixes: f022ff7bf377 ("thunderbolt: Retry DROM read once if parsing fails")
> I don't think it fixes a regression of above commit and I don't think
> this is stable material because it is quite a big change. I would rather
> let it sit in -rcX for a while to make sure no user visible changes are
> accidentally introduced. Is this OK for you?

No worry on dropping these two tags.  I'll do that for a v3.
The original change in v1 was more stable material than this is now.

While adopting your feedback below I'll keep in mind patch ordering
for refactoring as I do want to get the core of the solution back to stable
eventually when we're collectively confident on it.

> Did you check that the UUID of these (and other possible) devices stay
> the same before and after the patch?
Yeah I did across a few hotplug cycles and suspend cycles.
>
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>> v1->v2:
>>   * Update commit message to indicate which CMs are tested
>>   * Adjust flow to only fetch DROM from NVM on TBT3 and bit bang on TBT1/2
>> ---
>>   drivers/thunderbolt/eeprom.c | 145 +++++++++++++++++++----------------
>>   1 file changed, 80 insertions(+), 65 deletions(-)
>>
>> diff --git a/drivers/thunderbolt/eeprom.c b/drivers/thunderbolt/eeprom.c
>> index c90d22f56d4e..d1be72b6afdb 100644
>> --- a/drivers/thunderbolt/eeprom.c
>> +++ b/drivers/thunderbolt/eeprom.c
>> @@ -416,7 +416,7 @@ static int tb_drom_parse_entries(struct tb_switch *sw, size_t header_size)
>>   		if (pos + 1 == drom_size || pos + entry->len > drom_size
>>   				|| !entry->len) {
>>   			tb_sw_warn(sw, "DROM buffer overrun\n");
>> -			return -EILSEQ;
>> +			return -EIO;
>>   		}
>>   
>>   		switch (entry->type) {
>> @@ -544,7 +544,37 @@ static int tb_drom_read_n(struct tb_switch *sw, u16 offset, u8 *val,
>>   	return tb_eeprom_read_n(sw, offset, val, count);
>>   }
>>   
>> -static int tb_drom_parse(struct tb_switch *sw)
>> +static int tb_drom_bit_bang(struct tb_switch *sw, u16 *size)
>> +{
>> +	int res;
>> +
>> +	res = tb_drom_read_n(sw, 14, (u8 *) size, 2);
>> +	if (res)
>> +		return res;
>> +	*size &= 0x3ff;
>> +	*size += TB_DROM_DATA_START;
>> +	tb_sw_dbg(sw, "reading drom (length: %#x)\n", *size);
>> +	if (*size < sizeof(struct tb_drom_header)) {
>> +		tb_sw_warn(sw, "drom too small, aborting\n");
>> +		return -EIO;
>> +	}
>> +
>> +	sw->drom = kzalloc(*size, GFP_KERNEL);
>> +	if (!sw->drom)
>> +		return -ENOMEM;
>> +
>> +	res = tb_drom_read_n(sw, 0, sw->drom, *size);
>> +	if (res)
>> +		goto err;
>> +
>> +	return 0;
>> +err:
>> +	kfree(sw->drom);
>> +	sw->drom = NULL;
>> +	return res;
>> +}
> Can you split the refactoring part into a separate patch?

Sure.

>> +
>> +static int tb_drom_parse_v1(struct tb_switch *sw)
>>   {
>>   	const struct tb_drom_header *header =
>>   		(const struct tb_drom_header *)sw->drom;
>> @@ -555,7 +585,7 @@ static int tb_drom_parse(struct tb_switch *sw)
>>   		tb_sw_warn(sw,
>>   			"DROM UID CRC8 mismatch (expected: %#x, got: %#x)\n",
>>   			header->uid_crc8, crc);
>> -		return -EILSEQ;
>> +		return -EIO;
>>   	}
>>   	if (!sw->uid)
>>   		sw->uid = header->uid;
>> @@ -589,6 +619,43 @@ static int usb4_drom_parse(struct tb_switch *sw)
>>   	return tb_drom_parse_entries(sw, USB4_DROM_HEADER_SIZE);
>>   }
>>   
>> +static int tb_drom_parse(struct tb_switch *sw, u16 *size)
>> +{
>> +	struct tb_drom_header *header = (void *) sw->drom;
>> +	int res;
>> +
>> +	if (header->data_len + TB_DROM_DATA_START != *size) {
>> +		tb_sw_warn(sw, "drom size mismatch\n");
>> +		goto err;
>> +	}
>> +
>> +	tb_sw_dbg(sw, "DROM version: %d\n", header->device_rom_revision);
>> +
>> +	switch (header->device_rom_revision) {
>> +	case 3:
>> +		res = usb4_drom_parse(sw);
>> +		break;
>> +	default:
>> +		tb_sw_warn(sw, "DROM device_rom_revision %#x unknown\n",
>> +			   header->device_rom_revision);
>> +		fallthrough;
>> +	case 1:
>> +		res = tb_drom_parse_v1(sw);
>> +		break;
>> +	}
>> +
>> +	if (res) {
>> +		tb_sw_warn(sw, "parsing DROM failed\n");
>> +		goto err;
>> +	}
>> +
>> +	return 0;
>> +err:
>> +	kfree(sw->drom);
>> +	sw->drom = NULL;
>> +	return -EIO;
>> +}
>> +
>>   /**
>>    * tb_drom_read() - Copy DROM to sw->drom and parse it
>>    * @sw: Router whose DROM to read and parse
>> @@ -602,8 +669,7 @@ static int usb4_drom_parse(struct tb_switch *sw)
>>   int tb_drom_read(struct tb_switch *sw)
>>   {
>>   	u16 size;
>> -	struct tb_drom_header *header;
>> -	int res, retries = 1;
>> +	int res;
>>   
>>   	if (sw->drom)
>>   		return 0;
>> @@ -614,11 +680,11 @@ int tb_drom_read(struct tb_switch *sw)
>>   		 * in a device property. Use it if available.
>>   		 */
>>   		if (tb_drom_copy_efi(sw, &size) == 0)
>> -			goto parse;
>> +			return tb_drom_parse(sw, &size);
>>   
>>   		/* Non-Apple hardware has the DROM as part of NVM */
>>   		if (tb_drom_copy_nvm(sw, &size) == 0)
>> -			goto parse;
>> +			return tb_drom_parse(sw, &size);
>>   
>>   		/*
>>   		 * USB4 hosts may support reading DROM through router
>> @@ -627,7 +693,7 @@ int tb_drom_read(struct tb_switch *sw)
>>   		if (tb_switch_is_usb4(sw)) {
>>   			usb4_switch_read_uid(sw, &sw->uid);
>>   			if (!usb4_copy_host_drom(sw, &size))
>> -				goto parse;
>> +				return tb_drom_parse(sw, &size);
>>   		} else {
>>   			/*
>>   			 * The root switch contains only a dummy drom
>> @@ -640,64 +706,13 @@ int tb_drom_read(struct tb_switch *sw)
>>   		return 0;
>>   	}
>>   
>> -	res = tb_drom_read_n(sw, 14, (u8 *) &size, 2);
>> +	/* TBT3 devices have the DROM as part of NVM */
>> +	if (sw->generation < 3)
> This is true for TBT2 devices too. I think you want to check for the
> sw->cap_lc here instead. If it is set the device has LC and therefore we
> can use the LC UUID registers to figure out the UUID in later stages.
> Otherwise we need to read it through bitbanging.
OK thanks, will adjust.
>> +		res = tb_drom_bit_bang(sw, &size);
>> +	else
>> +		res = tb_drom_copy_nvm(sw, &size);
>>   	if (res)
>>   		return res;
>> -	size &= 0x3ff;
>> -	size += TB_DROM_DATA_START;
>> -	tb_sw_dbg(sw, "reading drom (length: %#x)\n", size);
>> -	if (size < sizeof(*header)) {
>> -		tb_sw_warn(sw, "drom too small, aborting\n");
>> -		return -EIO;
>> -	}
>> -
>> -	sw->drom = kzalloc(size, GFP_KERNEL);
>> -	if (!sw->drom)
>> -		return -ENOMEM;
>> -read:
>> -	res = tb_drom_read_n(sw, 0, sw->drom, size);
>> -	if (res)
>> -		goto err;
>> -
>> -parse:
>> -	header = (void *) sw->drom;
>> -
>> -	if (header->data_len + TB_DROM_DATA_START != size) {
>> -		tb_sw_warn(sw, "drom size mismatch\n");
>> -		if (retries--) {
>> -			msleep(100);
>> -			goto read;
>> -		}
>> -		goto err;
>> -	}
>>   
>> -	tb_sw_dbg(sw, "DROM version: %d\n", header->device_rom_revision);
>> -
>> -	switch (header->device_rom_revision) {
>> -	case 3:
>> -		res = usb4_drom_parse(sw);
>> -		break;
>> -	default:
>> -		tb_sw_warn(sw, "DROM device_rom_revision %#x unknown\n",
>> -			   header->device_rom_revision);
>> -		fallthrough;
>> -	case 1:
>> -		res = tb_drom_parse(sw);
>> -		break;
>> -	}
>> -
>> -	/* If the DROM parsing fails, wait a moment and retry once */
>> -	if (res == -EILSEQ && retries--) {
>> -		tb_sw_warn(sw, "parsing DROM failed\n");
>> -		msleep(100);
>> -		goto read;
>> -	}
>> -
>> -	if (!res)
>> -		return 0;
>> -
>> -err:
>> -	kfree(sw->drom);
>> -	sw->drom = NULL;
>> -	return -EIO;
>> +	return tb_drom_parse(sw, &size);
>>   }
>> -- 
>> 2.34.1