[Xen-devel,5/6] xen/arm: smccc: Add wrapper to automatically select the calling convention

Message ID 20180824165820.32620-6-julien.grall@arm.com
State Superseded
Headers show
Series
  • xen/arm: SMCCC fixup and improvement
Related show

Commit Message

Julien Grall Aug. 24, 2018, 4:58 p.m.
Signed-off-by: Julien Grall <julien.grall@arm.com>
---
 xen/arch/arm/psci.c              | 4 ++++
 xen/include/asm-arm/cpufeature.h | 3 ++-
 xen/include/asm-arm/smccc.h      | 8 ++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

Comments

Volodymyr Babchuk Aug. 27, 2018, 2:15 p.m. | #1
Hi Julien,

On 24.08.18 19:58, Julien Grall wrote:
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> ---
>   xen/arch/arm/psci.c              | 4 ++++
>   xen/include/asm-arm/cpufeature.h | 3 ++-
>   xen/include/asm-arm/smccc.h      | 8 ++++++++
>   3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
> index 3cf5ecf0f3..941eec921b 100644
> --- a/xen/arch/arm/psci.c
> +++ b/xen/arch/arm/psci.c
> @@ -21,6 +21,7 @@
>   #include <xen/types.h>
>   #include <xen/mm.h>
>   #include <xen/smp.h>
> +#include <asm/cpufeature.h>
>   #include <asm/psci.h>
>   #include <asm/acpi.h>
>   
> @@ -118,6 +119,9 @@ static void __init psci_init_smccc(void)
>               smccc_ver = ret;
>       }
>   
> +    if ( smccc_ver >= SMCCC_VERSION(1, 1) )
> +        cpus_set_cap(ARM_SMCCC_1_1);
> +
>       printk(XENLOG_INFO "Using SMC Calling Convention v%u.%u\n",
>              SMCCC_VERSION_MAJOR(smccc_ver), SMCCC_VERSION_MINOR(smccc_ver));
>   }
> diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
> index 9c297c521c..c9c4046f5f 100644
> --- a/xen/include/asm-arm/cpufeature.h
> +++ b/xen/include/asm-arm/cpufeature.h
> @@ -44,8 +44,9 @@
>   #define SKIP_CTXT_SWITCH_SERROR_SYNC 6
>   #define ARM_HARDEN_BRANCH_PREDICTOR 7
>   #define ARM_SSBD 8
> +#define ARM_SMCCC_1_1 9
>   
> -#define ARM_NCAPS           9
> +#define ARM_NCAPS           10
>   
>   #ifndef __ASSEMBLY__
>   
> diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
> index 1ed6cbaa48..7c39c530e2 100644
> --- a/xen/include/asm-arm/smccc.h
> +++ b/xen/include/asm-arm/smccc.h
> @@ -213,6 +213,7 @@ struct arm_smccc_res {
>    */
>   #ifdef CONFIG_ARM_32
>   #define arm_smccc_1_0_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
> +#define arm_smccc_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>   #else
>   
>   void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
> @@ -254,6 +255,13 @@ void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
>   #define arm_smccc_1_0_smc(...)                                              \
>           __arm_smccc_1_0_smc_count(__count_args(__VA_ARGS__), __VA_ARGS__)
>   
> +#define arm_smccc_smc(...)                                      \
> +    do {                                                        \
> +        if ( !cpus_have_const_cap(ARM_SMCCC_1_1) )              \
> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
> +        else                                                    \
> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
> +    } while ( 0 )
>   #endif /* CONFIG_ARM_64 */
>   
This will generate lots of code for every arm_smccc_smc(). Can we have 
function pointer arm_smccc_smc instead and assign it to either 
arm_smccc_1_1_smc() or arm_asmccc_1_0_smc() at boot?

I know that currently we have no function arm_smccc_1_1_smc() because it 
is being constructed inline every time. But we can write it explicitly 
and then have one indirect call to (maybe cached) function instead of
lots inlined code with conditional branches.

>   #endif /* __ASSEMBLY__ */
>
Julien Grall Aug. 27, 2018, 3:29 p.m. | #2
On 27/08/2018 15:15, Volodymyr Babchuk wrote:
> Hi Julien,

Hi,

> On 24.08.18 19:58, Julien Grall wrote:
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> ---
>>   xen/arch/arm/psci.c              | 4 ++++
>>   xen/include/asm-arm/cpufeature.h | 3 ++-
>>   xen/include/asm-arm/smccc.h      | 8 ++++++++
>>   3 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
>> index 3cf5ecf0f3..941eec921b 100644
>> --- a/xen/arch/arm/psci.c
>> +++ b/xen/arch/arm/psci.c
>> @@ -21,6 +21,7 @@
>>   #include <xen/types.h>
>>   #include <xen/mm.h>
>>   #include <xen/smp.h>
>> +#include <asm/cpufeature.h>
>>   #include <asm/psci.h>
>>   #include <asm/acpi.h>
>> @@ -118,6 +119,9 @@ static void __init psci_init_smccc(void)
>>               smccc_ver = ret;
>>       }
>> +    if ( smccc_ver >= SMCCC_VERSION(1, 1) )
>> +        cpus_set_cap(ARM_SMCCC_1_1);
>> +
>>       printk(XENLOG_INFO "Using SMC Calling Convention v%u.%u\n",
>>              SMCCC_VERSION_MAJOR(smccc_ver), 
>> SMCCC_VERSION_MINOR(smccc_ver));
>>   }
>> diff --git a/xen/include/asm-arm/cpufeature.h 
>> b/xen/include/asm-arm/cpufeature.h
>> index 9c297c521c..c9c4046f5f 100644
>> --- a/xen/include/asm-arm/cpufeature.h
>> +++ b/xen/include/asm-arm/cpufeature.h
>> @@ -44,8 +44,9 @@
>>   #define SKIP_CTXT_SWITCH_SERROR_SYNC 6
>>   #define ARM_HARDEN_BRANCH_PREDICTOR 7
>>   #define ARM_SSBD 8
>> +#define ARM_SMCCC_1_1 9
>> -#define ARM_NCAPS           9
>> +#define ARM_NCAPS           10
>>   #ifndef __ASSEMBLY__
>> diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
>> index 1ed6cbaa48..7c39c530e2 100644
>> --- a/xen/include/asm-arm/smccc.h
>> +++ b/xen/include/asm-arm/smccc.h
>> @@ -213,6 +213,7 @@ struct arm_smccc_res {
>>    */
>>   #ifdef CONFIG_ARM_32
>>   #define arm_smccc_1_0_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>> +#define arm_smccc_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>>   #else
>>   void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
>> @@ -254,6 +255,13 @@ void __arm_smccc_1_0_smc(register_t a0, 
>> register_t a1, register_t a2,
>>   #define 
>> arm_smccc_1_0_smc(...)                                              \
>>           __arm_smccc_1_0_smc_count(__count_args(__VA_ARGS__), 
>> __VA_ARGS__)
>> +#define arm_smccc_smc(...)                                      \
>> +    do {                                                        \
>> +        if ( !cpus_have_const_cap(ARM_SMCCC_1_1) )              \
>> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
>> +        else                                                    \
>> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
>> +    } while ( 0 )
>>   #endif /* CONFIG_ARM_64 */
> This will generate lots of code for every arm_smccc_smc(). Can we have 
> function pointer arm_smccc_smc instead and assign it to either 
> arm_smccc_1_1_smc() or arm_asmccc_1_0_smc() at boot?
> 
> I know that currently we have no function arm_smccc_1_1_smc() because it 
> is being constructed inline every time. But we can write it explicitly 
> and then have one indirect call to (maybe cached) function instead of
> lots inlined code with conditional branches.

This is indeed increasing the size of the function, but with a better 
performance when you perform an SMC with 1.1.

The main goal of SMCCC 1.1 is to limit the number of registers saved 
when calling an SMC. So if you provide provide a wrapper using a 
function, then you are better off sticking with SMCCC 1.0.

The idea of this code is to provide a faster call on the presence of 
SMCCC 1.1 while providing a fallback for other case. The function 
cpus_have_const_cap is implemented using an ALTERNATIVE (similar to 
static key) that will make if close to a NOP. I am saying close because 
this is not quite a static key as we don't have it in Xen. Instead, you 
replace a memory load by a constant.

Cheers,
Volodymyr Babchuk Aug. 27, 2018, 4:50 p.m. | #3
Hi,

On 27.08.18 18:29, Julien Grall wrote:
> 
> 
> On 27/08/2018 15:15, Volodymyr Babchuk wrote:
>> Hi Julien,
> 
> Hi,
> 
>> On 24.08.18 19:58, Julien Grall wrote:
>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>> ---
>>>   xen/arch/arm/psci.c              | 4 ++++
>>>   xen/include/asm-arm/cpufeature.h | 3 ++-
>>>   xen/include/asm-arm/smccc.h      | 8 ++++++++
>>>   3 files changed, 14 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
>>> index 3cf5ecf0f3..941eec921b 100644
>>> --- a/xen/arch/arm/psci.c
>>> +++ b/xen/arch/arm/psci.c
>>> @@ -21,6 +21,7 @@
>>>   #include <xen/types.h>
>>>   #include <xen/mm.h>
>>>   #include <xen/smp.h>
>>> +#include <asm/cpufeature.h>
>>>   #include <asm/psci.h>
>>>   #include <asm/acpi.h>
>>> @@ -118,6 +119,9 @@ static void __init psci_init_smccc(void)
>>>               smccc_ver = ret;
>>>       }
>>> +    if ( smccc_ver >= SMCCC_VERSION(1, 1) )
>>> +        cpus_set_cap(ARM_SMCCC_1_1);
>>> +
>>>       printk(XENLOG_INFO "Using SMC Calling Convention v%u.%u\n",
>>>              SMCCC_VERSION_MAJOR(smccc_ver), 
>>> SMCCC_VERSION_MINOR(smccc_ver));
>>>   }
>>> diff --git a/xen/include/asm-arm/cpufeature.h 
>>> b/xen/include/asm-arm/cpufeature.h
>>> index 9c297c521c..c9c4046f5f 100644
>>> --- a/xen/include/asm-arm/cpufeature.h
>>> +++ b/xen/include/asm-arm/cpufeature.h
>>> @@ -44,8 +44,9 @@
>>>   #define SKIP_CTXT_SWITCH_SERROR_SYNC 6
>>>   #define ARM_HARDEN_BRANCH_PREDICTOR 7
>>>   #define ARM_SSBD 8
>>> +#define ARM_SMCCC_1_1 9
>>> -#define ARM_NCAPS           9
>>> +#define ARM_NCAPS           10
>>>   #ifndef __ASSEMBLY__
>>> diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
>>> index 1ed6cbaa48..7c39c530e2 100644
>>> --- a/xen/include/asm-arm/smccc.h
>>> +++ b/xen/include/asm-arm/smccc.h
>>> @@ -213,6 +213,7 @@ struct arm_smccc_res {
>>>    */
>>>   #ifdef CONFIG_ARM_32
>>>   #define arm_smccc_1_0_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>>> +#define arm_smccc_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>>>   #else
>>>   void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
>>> @@ -254,6 +255,13 @@ void __arm_smccc_1_0_smc(register_t a0, 
>>> register_t a1, register_t a2,
>>>   #define 
>>> arm_smccc_1_0_smc(...)                                              \
>>>           __arm_smccc_1_0_smc_count(__count_args(__VA_ARGS__), 
>>> __VA_ARGS__)
>>> +#define arm_smccc_smc(...)                                      \
>>> +    do {                                                        \
>>> +        if ( !cpus_have_const_cap(ARM_SMCCC_1_1) )              \
>>> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
>>> +        else                                                    \
>>> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
>>> +    } while ( 0 )
>>>   #endif /* CONFIG_ARM_64 */
>> This will generate lots of code for every arm_smccc_smc(). Can we have 
>> function pointer arm_smccc_smc instead and assign it to either 
>> arm_smccc_1_1_smc() or arm_asmccc_1_0_smc() at boot?
>>
>> I know that currently we have no function arm_smccc_1_1_smc() because 
>> it is being constructed inline every time. But we can write it 
>> explicitly and then have one indirect call to (maybe cached) function 
>> instead of
>> lots inlined code with conditional branches.
> 
> This is indeed increasing the size of the function, but with a better 
> performance when you perform an SMC with 1.1.
> 
> The main goal of SMCCC 1.1 is to limit the number of registers saved 
> when calling an SMC. So if you provide provide a wrapper using a 
> function, then you are better off sticking with SMCCC 1.0.
> 
> The idea of this code is to provide a faster call on the presence of 
> SMCCC 1.1 while providing a fallback for other case. The function 
> cpus_have_const_cap is implemented using an ALTERNATIVE (similar to 
> static key) that will make if close to a NOP. I am saying close because 
> this is not quite a static key as we don't have it in Xen. Instead, you 
> replace a memory load by a constant.
Ah, yes, true.

I checked how static keys are working. Seems, they use asm goto feature. 
Now I think: can we optimize this more? Something like that:

#define arm_smccc_smc(...)
     do {

             asm goto (ALTERNATIVE(nop",
                                   "b %l[smccc_1_1]",
                                   ARM_SMCCC_1_1)); 

             arm_smccc_1_0_smc(__VA_ARGS__);
             break;
smccc_1_1:
             arm_smccc_1_1_smc(__VA_ARGS__);
    } while ( 0 )

This will save use additional conditional branch.
Julien Grall Aug. 28, 2018, 2:05 p.m. | #4
Hi Volodymyr,

On 27/08/18 17:50, Volodymyr Babchuk wrote:
> On 27.08.18 18:29, Julien Grall wrote:
>> On 27/08/2018 15:15, Volodymyr Babchuk wrote:
>>> On 24.08.18 19:58, Julien Grall wrote:
>>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> This is indeed increasing the size of the function, but with a better 
>> performance when you perform an SMC with 1.1.
>>
>> The main goal of SMCCC 1.1 is to limit the number of registers saved 
>> when calling an SMC. So if you provide provide a wrapper using a 
>> function, then you are better off sticking with SMCCC 1.0.
>>
>> The idea of this code is to provide a faster call on the presence of 
>> SMCCC 1.1 while providing a fallback for other case. The function 
>> cpus_have_const_cap is implemented using an ALTERNATIVE (similar to 
>> static key) that will make if close to a NOP. I am saying close 
>> because this is not quite a static key as we don't have it in Xen. 
>> Instead, you replace a memory load by a constant.
> Ah, yes, true.
> 
> I checked how static keys are working. Seems, they use asm goto feature. 
> Now I think: can we optimize this more? Something like that:
> 
> #define arm_smccc_smc(...)
>      do {
> 
>              asm goto (ALTERNATIVE(nop",
>                                    "b %l[smccc_1_1]",
>                                    ARM_SMCCC_1_1));

You would need to list the label in GotoLabels (see 6.45.2.7 [1]).

>              arm_smccc_1_0_smc(__VA_ARGS__);
>              break;
> smccc_1_1:
The label will get redefined if you have multiple SMC call in the same 
function. We could possibly generate label based on the file/line.

>              arm_smccc_1_1_smc(__VA_ARGS__);
>     } while ( 0 )
> 
> This will save use additional conditional branch.
The code with this patch looks like:

mov x0, #0		mov x0, #1
cbz w0, label		cbz w0, label

The mov + conditional branch is likely to be negligible over the cost of 
switching to EL3. Overall, I am not really convinced that it would be 
worth the cost of open-coding it. I would prefer if we keep the call to 
cpus_have_const_cap() and see if it can be optimized.

I have looked at cpus_have_const_cap() and haven't found good way to 
optimize it with the current infrastructure in Xen. Feel free to suggest 
improvement.

Cheers,

[1] https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
Volodymyr Babchuk Aug. 28, 2018, 2:40 p.m. | #5
Hi Julien,

On 28.08.18 17:05, Julien Grall wrote:
> Hi Volodymyr,
> 
> On 27/08/18 17:50, Volodymyr Babchuk wrote:
>> On 27.08.18 18:29, Julien Grall wrote:
>>> On 27/08/2018 15:15, Volodymyr Babchuk wrote:
>>>> On 24.08.18 19:58, Julien Grall wrote:
>>>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>> This is indeed increasing the size of the function, but with a better 
>>> performance when you perform an SMC with 1.1.
>>>
>>> The main goal of SMCCC 1.1 is to limit the number of registers saved 
>>> when calling an SMC. So if you provide provide a wrapper using a 
>>> function, then you are better off sticking with SMCCC 1.0.
>>>
>>> The idea of this code is to provide a faster call on the presence of 
>>> SMCCC 1.1 while providing a fallback for other case. The function 
>>> cpus_have_const_cap is implemented using an ALTERNATIVE (similar to 
>>> static key) that will make if close to a NOP. I am saying close 
>>> because this is not quite a static key as we don't have it in Xen. 
>>> Instead, you replace a memory load by a constant.
>> Ah, yes, true.
>>
>> I checked how static keys are working. Seems, they use asm goto 
>> feature. Now I think: can we optimize this more? Something like that:
>>
>> #define arm_smccc_smc(...)
>>      do {
>>
>>              asm goto (ALTERNATIVE(nop",
>>                                    "b %l[smccc_1_1]",
>>                                    ARM_SMCCC_1_1));
> 
> You would need to list the label in GotoLabels (see 6.45.2.7 [1]).
Yes, but as you said, we can use __LINE__ there.

>>              arm_smccc_1_0_smc(__VA_ARGS__);
>>              break;
>> smccc_1_1:
> The label will get redefined if you have multiple SMC call in the same 
> function. We could possibly generate label based on the file/line.
> 
>>              arm_smccc_1_1_smc(__VA_ARGS__);
>>     } while ( 0 )
>>
>> This will save use additional conditional branch.
> The code with this patch looks like:
> 
> mov x0, #0        mov x0, #1
> cbz w0, label        cbz w0, label
> 
> The mov + conditional branch is likely to be negligible over the cost of 
> switching to EL3. Overall, I am not really convinced that it would be 
> worth the cost of open-coding it. I would prefer if we keep the call to 
> cpus_have_const_cap() and see if it can be optimized.
Okay. I'm fine with this. I just saw one more way to optimize and wanted
to share it with you. Anyways:

Reviewed-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

> I have looked at cpus_have_const_cap() and haven't found good way to 
> optimize it with the current infrastructure in Xen. Feel free to suggest 
> improvement.

Another thing: maybe it is worth to branch to 1.0 code and leave 1.1 in 
a straight path of execution? This will save you one more instruction 
for SMCCC 1.1 call.
Julien Grall Aug. 28, 2018, 2:43 p.m. | #6
On 28/08/18 15:40, Volodymyr Babchuk wrote:
> Hi Julien,
> 
> On 28.08.18 17:05, Julien Grall wrote:
>> Hi Volodymyr,
>>
>> On 27/08/18 17:50, Volodymyr Babchuk wrote:
>>> On 27.08.18 18:29, Julien Grall wrote:
>>>> On 27/08/2018 15:15, Volodymyr Babchuk wrote:
>>>>> On 24.08.18 19:58, Julien Grall wrote:
>>>>>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>>> This is indeed increasing the size of the function, but with a 
>>>> better performance when you perform an SMC with 1.1.
>>>>
>>>> The main goal of SMCCC 1.1 is to limit the number of registers saved 
>>>> when calling an SMC. So if you provide provide a wrapper using a 
>>>> function, then you are better off sticking with SMCCC 1.0.
>>>>
>>>> The idea of this code is to provide a faster call on the presence of 
>>>> SMCCC 1.1 while providing a fallback for other case. The function 
>>>> cpus_have_const_cap is implemented using an ALTERNATIVE (similar to 
>>>> static key) that will make if close to a NOP. I am saying close 
>>>> because this is not quite a static key as we don't have it in Xen. 
>>>> Instead, you replace a memory load by a constant.
>>> Ah, yes, true.
>>>
>>> I checked how static keys are working. Seems, they use asm goto 
>>> feature. Now I think: can we optimize this more? Something like that:
>>>
>>> #define arm_smccc_smc(...)
>>>      do {
>>>
>>>              asm goto (ALTERNATIVE(nop",
>>>                                    "b %l[smccc_1_1]",
>>>                                    ARM_SMCCC_1_1));
>>
>> You would need to list the label in GotoLabels (see 6.45.2.7 [1]).
> Yes, but as you said, we can use __LINE__ there.
> 
>>>              arm_smccc_1_0_smc(__VA_ARGS__);
>>>              break;
>>> smccc_1_1:
>> The label will get redefined if you have multiple SMC call in the same 
>> function. We could possibly generate label based on the file/line.
>>
>>>              arm_smccc_1_1_smc(__VA_ARGS__);
>>>     } while ( 0 )
>>>
>>> This will save use additional conditional branch.
>> The code with this patch looks like:
>>
>> mov x0, #0        mov x0, #1
>> cbz w0, label        cbz w0, label
>>
>> The mov + conditional branch is likely to be negligible over the cost 
>> of switching to EL3. Overall, I am not really convinced that it would 
>> be worth the cost of open-coding it. I would prefer if we keep the 
>> call to cpus_have_const_cap() and see if it can be optimized.
> Okay. I'm fine with this. I just saw one more way to optimize and wanted
> to share it with you. Anyways:
> 
> Reviewed-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

Thank you!

>> I have looked at cpus_have_const_cap() and haven't found good way to 
>> optimize it with the current infrastructure in Xen. Feel free to 
>> suggest improvement.
> 
> Another thing: maybe it is worth to branch to 1.0 code and leave 1.1 in 
> a straight path of execution? This will save you one more instruction 
> for SMCCC 1.1 call.

I am not sure to understand your suggestion here. Could you expand?

However, why SMCCC 1.1 should be in the straight path of execution?

Cheers,
Volodymyr Babchuk Aug. 28, 2018, 3:10 p.m. | #7
On 28.08.18 17:43, Julien Grall wrote:
[...]

> 
>>> I have looked at cpus_have_const_cap() and haven't found good way to 
>>> optimize it with the current infrastructure in Xen. Feel free to 
>>> suggest improvement.
>>
>> Another thing: maybe it is worth to branch to 1.0 code and leave 1.1 
>> in a straight path of execution? This will save you one more 
>> instruction for SMCCC 1.1 call.
> 
> I am not sure to understand your suggestion here. Could you expand?

+#define arm_smccc_smc(...)                                      \
+    do {                                                        \
+        if ( cpus_have_const_cap(ARM_SMCCC_1_1) )              \
+            arm_smccc_1_1_smc(__VA_ARGS__);                     \
+        else                                                    \
+            arm_smccc_1_0_smc(__VA_ARGS__);                     \
+    } while ( 0 )


> However, why SMCCC 1.1 should be in the straight path of execution?

It is easier to read - no negation in if(). Also, I think, branch 
predictor would be happy. At least, if the following is true:

" If the branch information is not contained in the BTAC, static branch 
prediction is used, whereby we assume the branch will be taken if the 
branch is a conditional backwards branch or not taken if the branch is a 
conditional forwards branch." [1]


[1] 
http://linuxkernelhacker.blogspot.com/2014/07/branch-prediction-logic-in-arm.html
Julien Grall Aug. 28, 2018, 3:27 p.m. | #8
Hi Volodymyr,

On 28/08/18 16:10, Volodymyr Babchuk wrote:
> 
> 
> On 28.08.18 17:43, Julien Grall wrote:
> [...]
> 
>>
>>>> I have looked at cpus_have_const_cap() and haven't found good way to 
>>>> optimize it with the current infrastructure in Xen. Feel free to 
>>>> suggest improvement.
>>>
>>> Another thing: maybe it is worth to branch to 1.0 code and leave 1.1 
>>> in a straight path of execution? This will save you one more 
>>> instruction for SMCCC 1.1 call.
>>
>> I am not sure to understand your suggestion here. Could you expand?
> 
> +#define arm_smccc_smc(...)                                      \
> +    do {                                                        \
> +        if ( cpus_have_const_cap(ARM_SMCCC_1_1) )              \
> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
> +        else                                                    \
> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
> +    } while ( 0 )
> 
> 
>> However, why SMCCC 1.1 should be in the straight path of execution?
> 
> It is easier to read - no negation in if(). 

I can do that. I will also probably add an unlikely in 
cpus_have_const_cap(...).

> Also, I think, branch 
> predictor would be happy. At least, if the following is true:
> 
> " If the branch information is not contained in the BTAC, static branch 
> prediction is used, whereby we assume the branch will be taken if the 
> branch is a conditional backwards branch or not taken if the branch is a 
> conditional forwards branch." [1]

The Arm Arm does not provide any details on the branch predictor 
implementation. An implementer is free to do whatever he wants and could 
design a branch predicator doing the opposite as this statement.

You also can't assume how the compiler will compile the code, it may end 
up to generate the else branch first because it is predicted to be taken 
more often. This is why GCC provide __builtin_expect (commonly used as 
unlikely/likely) to influence the compiler choice for branch prediction.

Cheers,
Volodymyr Babchuk Aug. 28, 2018, 3:50 p.m. | #9
Hi Julien,

On 28.08.18 18:27, Julien Grall wrote:
> Hi Volodymyr,
> 
> On 28/08/18 16:10, Volodymyr Babchuk wrote:
>>
>>
>> On 28.08.18 17:43, Julien Grall wrote:
>> [...]
>>
>>>
>>>>> I have looked at cpus_have_const_cap() and haven't found good way 
>>>>> to optimize it with the current infrastructure in Xen. Feel free to 
>>>>> suggest improvement.
>>>>
>>>> Another thing: maybe it is worth to branch to 1.0 code and leave 1.1 
>>>> in a straight path of execution? This will save you one more 
>>>> instruction for SMCCC 1.1 call.
>>>
>>> I am not sure to understand your suggestion here. Could you expand?
>>
>> +#define arm_smccc_smc(...)                                      \
>> +    do {                                                        \
>> +        if ( cpus_have_const_cap(ARM_SMCCC_1_1) )              \
>> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
>> +        else                                                    \
>> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
>> +    } while ( 0 )
>>
>>
>>> However, why SMCCC 1.1 should be in the straight path of execution?
>>
>> It is easier to read - no negation in if(). 
> 
> I can do that. I will also probably add an unlikely in 
> cpus_have_const_cap(...).
That would  be great.

> 
>> Also, I think, branch predictor would be happy. At least, if the 
>> following is true:
>>
>> " If the branch information is not contained in the BTAC, static 
>> branch prediction is used, whereby we assume the branch will be taken 
>> if the branch is a conditional backwards branch or not taken if the 
>> branch is a conditional forwards branch." [1]
> 
> The Arm Arm does not provide any details on the branch predictor 
> implementation. An implementer is free to do whatever he wants and could 
> design a branch predicator doing the opposite as this statement.
Generally speaking - yes. But, AFAIK, most static predictors behave in 
the way described above. Anyways, as you pointed below, better to leave 
hint for the compiler.

> 
> You also can't assume how the compiler will compile the code, it may end 
> up to generate the else branch first because it is predicted to be taken 
> more often. This is why GCC provide __builtin_expect (commonly used as 
> unlikely/likely) to influence the compiler choice for branch prediction.
Yes, this is the good point. So, you can add likely/unlikely not only in 
cpus_have_const_cap(...) but also in #define arm_smccc_smc(...)
Julien Grall Aug. 28, 2018, 3:57 p.m. | #10
On 28/08/18 16:50, Volodymyr Babchuk wrote:
> Hi Julien,

Hi,

> 
> On 28.08.18 18:27, Julien Grall wrote:
>> Hi Volodymyr,
>>
>> On 28/08/18 16:10, Volodymyr Babchuk wrote:
>>>
>>>
>>> On 28.08.18 17:43, Julien Grall wrote:
>>> [...]
>>>
>>>>
>>>>>> I have looked at cpus_have_const_cap() and haven't found good way 
>>>>>> to optimize it with the current infrastructure in Xen. Feel free 
>>>>>> to suggest improvement.
>>>>>
>>>>> Another thing: maybe it is worth to branch to 1.0 code and leave 
>>>>> 1.1 in a straight path of execution? This will save you one more 
>>>>> instruction for SMCCC 1.1 call.
>>>>
>>>> I am not sure to understand your suggestion here. Could you expand?
>>>
>>> +#define arm_smccc_smc(...)                                      \
>>> +    do {                                                        \
>>> +        if ( cpus_have_const_cap(ARM_SMCCC_1_1) )              \
>>> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
>>> +        else                                                    \
>>> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
>>> +    } while ( 0 )
>>>
>>>
>>>> However, why SMCCC 1.1 should be in the straight path of execution?
>>>
>>> It is easier to read - no negation in if(). 
>>
>> I can do that. I will also probably add an unlikely in 
>> cpus_have_const_cap(...).
> That would  be great.
> 
>>
>>> Also, I think, branch predictor would be happy. At least, if the 
>>> following is true:
>>>
>>> " If the branch information is not contained in the BTAC, static 
>>> branch prediction is used, whereby we assume the branch will be taken 
>>> if the branch is a conditional backwards branch or not taken if the 
>>> branch is a conditional forwards branch." [1]
>>
>> The Arm Arm does not provide any details on the branch predictor 
>> implementation. An implementer is free to do whatever he wants and 
>> could design a branch predicator doing the opposite as this statement.
> Generally speaking - yes. But, AFAIK, most static predictors behave in 
> the way described above. Anyways, as you pointed below, better to leave 
> hint for the compiler.

Spectre would not have existed if the branch predictor was so easy ;).

> 
>>
>> You also can't assume how the compiler will compile the code, it may 
>> end up to generate the else branch first because it is predicted to be 
>> taken more often. This is why GCC provide __builtin_expect (commonly 
>> used as unlikely/likely) to influence the compiler choice for branch 
>> prediction.
> Yes, this is the good point. So, you can add likely/unlikely not only in 
> cpus_have_const_cap(...) but also in #define arm_smccc_smc(...)

There are no need to have likely/unlikely in arm_smccc_smc if it is 
already present in cpus_have_const_cap.

Cheers,
Volodymyr Babchuk Aug. 30, 2018, 5:45 p.m. | #11
Hi Julien,


On 24.08.18 19:58, Julien Grall wrote:
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> ---
>   xen/arch/arm/psci.c              | 4 ++++
>   xen/include/asm-arm/cpufeature.h | 3 ++-
>   xen/include/asm-arm/smccc.h      | 8 ++++++++
>   3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
> index 3cf5ecf0f3..941eec921b 100644
> --- a/xen/arch/arm/psci.c
> +++ b/xen/arch/arm/psci.c
> @@ -21,6 +21,7 @@
>   #include <xen/types.h>
>   #include <xen/mm.h>
>   #include <xen/smp.h>
> +#include <asm/cpufeature.h>
>   #include <asm/psci.h>
>   #include <asm/acpi.h>
>   
> @@ -118,6 +119,9 @@ static void __init psci_init_smccc(void)
>               smccc_ver = ret;
>       }
>   
> +    if ( smccc_ver >= SMCCC_VERSION(1, 1) )
> +        cpus_set_cap(ARM_SMCCC_1_1);
> +
>       printk(XENLOG_INFO "Using SMC Calling Convention v%u.%u\n",
>              SMCCC_VERSION_MAJOR(smccc_ver), SMCCC_VERSION_MINOR(smccc_ver));
>   }
> diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
> index 9c297c521c..c9c4046f5f 100644
> --- a/xen/include/asm-arm/cpufeature.h
> +++ b/xen/include/asm-arm/cpufeature.h
> @@ -44,8 +44,9 @@
>   #define SKIP_CTXT_SWITCH_SERROR_SYNC 6
>   #define ARM_HARDEN_BRANCH_PREDICTOR 7
>   #define ARM_SSBD 8
> +#define ARM_SMCCC_1_1 9
>   
> -#define ARM_NCAPS           9
> +#define ARM_NCAPS           10
>   
>   #ifndef __ASSEMBLY__
>   
> diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
> index 1ed6cbaa48..7c39c530e2 100644
> --- a/xen/include/asm-arm/smccc.h
> +++ b/xen/include/asm-arm/smccc.h

+#include <asm/cpufeature.h>

> @@ -213,6 +213,7 @@ struct arm_smccc_res {
>    */
>   #ifdef CONFIG_ARM_32
>   #define arm_smccc_1_0_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
> +#define arm_smccc_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
>   #else
>   
>   void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
> @@ -254,6 +255,13 @@ void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
>   #define arm_smccc_1_0_smc(...)                                              \
>           __arm_smccc_1_0_smc_count(__count_args(__VA_ARGS__), __VA_ARGS__)
>   
> +#define arm_smccc_smc(...)                                      \
> +    do {                                                        \
> +        if ( !cpus_have_const_cap(ARM_SMCCC_1_1) )              \
> +            arm_smccc_1_0_smc(__VA_ARGS__);                     \
> +        else                                                    \
> +            arm_smccc_1_1_smc(__VA_ARGS__);                     \
> +    } while ( 0 )
>   #endif /* CONFIG_ARM_64 */
>   
>   #endif /* __ASSEMBLY__ */
>

Patch

diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
index 3cf5ecf0f3..941eec921b 100644
--- a/xen/arch/arm/psci.c
+++ b/xen/arch/arm/psci.c
@@ -21,6 +21,7 @@ 
 #include <xen/types.h>
 #include <xen/mm.h>
 #include <xen/smp.h>
+#include <asm/cpufeature.h>
 #include <asm/psci.h>
 #include <asm/acpi.h>
 
@@ -118,6 +119,9 @@  static void __init psci_init_smccc(void)
             smccc_ver = ret;
     }
 
+    if ( smccc_ver >= SMCCC_VERSION(1, 1) )
+        cpus_set_cap(ARM_SMCCC_1_1);
+
     printk(XENLOG_INFO "Using SMC Calling Convention v%u.%u\n",
            SMCCC_VERSION_MAJOR(smccc_ver), SMCCC_VERSION_MINOR(smccc_ver));
 }
diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
index 9c297c521c..c9c4046f5f 100644
--- a/xen/include/asm-arm/cpufeature.h
+++ b/xen/include/asm-arm/cpufeature.h
@@ -44,8 +44,9 @@ 
 #define SKIP_CTXT_SWITCH_SERROR_SYNC 6
 #define ARM_HARDEN_BRANCH_PREDICTOR 7
 #define ARM_SSBD 8
+#define ARM_SMCCC_1_1 9
 
-#define ARM_NCAPS           9
+#define ARM_NCAPS           10
 
 #ifndef __ASSEMBLY__
 
diff --git a/xen/include/asm-arm/smccc.h b/xen/include/asm-arm/smccc.h
index 1ed6cbaa48..7c39c530e2 100644
--- a/xen/include/asm-arm/smccc.h
+++ b/xen/include/asm-arm/smccc.h
@@ -213,6 +213,7 @@  struct arm_smccc_res {
  */
 #ifdef CONFIG_ARM_32
 #define arm_smccc_1_0_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
+#define arm_smccc_smc(...) arm_smccc_1_1_smc(__VA_ARGS__)
 #else
 
 void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
@@ -254,6 +255,13 @@  void __arm_smccc_1_0_smc(register_t a0, register_t a1, register_t a2,
 #define arm_smccc_1_0_smc(...)                                              \
         __arm_smccc_1_0_smc_count(__count_args(__VA_ARGS__), __VA_ARGS__)
 
+#define arm_smccc_smc(...)                                      \
+    do {                                                        \
+        if ( !cpus_have_const_cap(ARM_SMCCC_1_1) )              \
+            arm_smccc_1_0_smc(__VA_ARGS__);                     \
+        else                                                    \
+            arm_smccc_1_1_smc(__VA_ARGS__);                     \
+    } while ( 0 )
 #endif /* CONFIG_ARM_64 */
 
 #endif /* __ASSEMBLY__ */