[3/4] accel/tcg: Add cluster number to TCG TB hash

Message ID 20190108163008.7006-4-peter.maydell@linaro.org
State Superseded
Headers show
Series
  • tcg: support heterogenous CPU clusters
Related show

Commit Message

Peter Maydell Jan. 8, 2019, 4:30 p.m.
Include the cluster number in the hash we use to look
up TBs. This is important because a TB that is valid
for one cluster at a given physical address and set
of CPU flags is not necessarily valid for another:
the two clusters may have different views of physical
memory, or may have different CPU features (eg FPU
present or absent).

We put the cluster number in the high 8 bits of the
TB cflags. This gives us up to 256 clusters, which should
be enough for anybody. If we ever need more, or need
more bits in cflags for other purposes, we could make
tb_hash_func() take more data (and expand qemu_xxhash7()
to qemu_xxhash8()).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

---
 include/exec/exec-all.h   | 4 +++-
 accel/tcg/cpu-exec.c      | 4 ++++
 accel/tcg/translate-all.c | 3 +++
 3 files changed, 10 insertions(+), 1 deletion(-)

-- 
2.19.2

Comments

Luc Michel Jan. 10, 2019, 3:14 p.m. | #1
On 1/8/19 5:30 PM, Peter Maydell wrote:
> Include the cluster number in the hash we use to look

> up TBs. This is important because a TB that is valid

> for one cluster at a given physical address and set

> of CPU flags is not necessarily valid for another:

> the two clusters may have different views of physical

> memory, or may have different CPU features (eg FPU

> present or absent).

> 

> We put the cluster number in the high 8 bits of the

> TB cflags. This gives us up to 256 clusters, which should

> be enough for anybody. If we ever need more, or need

> more bits in cflags for other purposes, we could make

> tb_hash_func() take more data (and expand qemu_xxhash7()

> to qemu_xxhash8()).

> 

> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

Reviewed-by: Luc Michel <luc.michel@greensocs.com>

> ---

>  include/exec/exec-all.h   | 4 +++-

>  accel/tcg/cpu-exec.c      | 4 ++++

>  accel/tcg/translate-all.c | 3 +++

>  3 files changed, 10 insertions(+), 1 deletion(-)

> 

> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h

> index 815e5b1e838..aa7b81aaf01 100644

> --- a/include/exec/exec-all.h

> +++ b/include/exec/exec-all.h

> @@ -351,9 +351,11 @@ struct TranslationBlock {

>  #define CF_USE_ICOUNT  0x00020000

>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */

>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */

> +#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */

> +#define CF_CLUSTER_SHIFT 24

>  /* cflags' mask for hashing/comparison */

>  #define CF_HASH_MASK   \

> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL)

> +    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)

>  

>      /* Per-vCPU dynamic tracing state used to generate this TB */

>      uint32_t trace_vcpu_dstate;

> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c

> index 870027d4359..e578a1a3aee 100644

> --- a/accel/tcg/cpu-exec.c

> +++ b/accel/tcg/cpu-exec.c

> @@ -336,6 +336,10 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,

>          return NULL;

>      }

>      desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;

> +

> +    cf_mask &= ~CF_CLUSTER_MASK;

> +    cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;

> +

>      h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);

>      return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);

>  }

> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c

> index 639f0b27287..ba27f5acc8c 100644

> --- a/accel/tcg/translate-all.c

> +++ b/accel/tcg/translate-all.c

> @@ -1692,6 +1692,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>          cflags |= CF_NOCACHE | 1;

>      }

>  

> +    cflags &= ~CF_CLUSTER_MASK;

> +    cflags |= cpu->cluster_index << CF_CLUSTER_SHIFT;

> +

>   buffer_overflow:

>      tb = tb_alloc(pc);

>      if (unlikely(!tb)) {

>
Peter Maydell Jan. 10, 2019, 6:26 p.m. | #2
On Tue, 8 Jan 2019 at 16:30, Peter Maydell <peter.maydell@linaro.org> wrote:
>

> Include the cluster number in the hash we use to look

> up TBs. This is important because a TB that is valid

> for one cluster at a given physical address and set

> of CPU flags is not necessarily valid for another:

> the two clusters may have different views of physical

> memory, or may have different CPU features (eg FPU

> present or absent).

>

> We put the cluster number in the high 8 bits of the

> TB cflags. This gives us up to 256 clusters, which should

> be enough for anybody. If we ever need more, or need

> more bits in cflags for other purposes, we could make

> tb_hash_func() take more data (and expand qemu_xxhash7()

> to qemu_xxhash8()).

>

> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

> ---

>  include/exec/exec-all.h   | 4 +++-

>  accel/tcg/cpu-exec.c      | 4 ++++

>  accel/tcg/translate-all.c | 3 +++

>  3 files changed, 10 insertions(+), 1 deletion(-)

>

> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h

> index 815e5b1e838..aa7b81aaf01 100644

> --- a/include/exec/exec-all.h

> +++ b/include/exec/exec-all.h

> @@ -351,9 +351,11 @@ struct TranslationBlock {

>  #define CF_USE_ICOUNT  0x00020000

>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */

>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */

> +#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */

> +#define CF_CLUSTER_SHIFT 24

>  /* cflags' mask for hashing/comparison */

>  #define CF_HASH_MASK   \

> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL)

> +    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)

>

>      /* Per-vCPU dynamic tracing state used to generate this TB */

>      uint32_t trace_vcpu_dstate;

> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c

> index 870027d4359..e578a1a3aee 100644

> --- a/accel/tcg/cpu-exec.c

> +++ b/accel/tcg/cpu-exec.c

> @@ -336,6 +336,10 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,

>          return NULL;

>      }

>      desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;

> +

> +    cf_mask &= ~CF_CLUSTER_MASK;

> +    cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;

> +


This hunk turns out not to be quite right -- it needs to move
to the top of the function, before the assignment
"desc.flags = flags;". Otherwise tb_lookup_cmp() will
spuriously fail, and execution becomes somewhat slower because
we have to keep retranslating TBs rather than reusing them.
(Surprisingly this is only noticeable in an ARM TFM image
I happen to have, not in Linux kernel boot...)

thanks
-- PMM
Peter Maydell Jan. 11, 2019, 1 p.m. | #3
On Fri, 11 Jan 2019 at 12:49, Aleksandar Markovic
<aleksandar.m.mail@gmail.com> wrote:
> 1. What would be, in more detail, if possible in layman terms,

> the "bad case" that this series fixes?


I describe this in the cover letter (which also has a link to
a tarball with a test case demonstrating it):
> TCG implicitly assumes that all CPUs are alike, because we have

> a single cache of generated TBs and we don't account for which

> CPU generated the code or is looking for the TB when adding or

> searching for generated TBs. This can go wrong in two situations:

> (1) two CPUs have different physical address spaces (eg CPU 1

> has one lot of RAM/ROM, and CPU 2 has different RAM/ROM): the

> physical address alone is then not sufficient to distinguish

> what code to run

> (2) two CPUs have different features (eg FPU

> vs no FPU): since our TCG frontends bake assumptions into the

> generated code about the presence/absence of features, if a

> CPU with FPU picks up a TB for one generated without an FPU

> it will behave wrongly


What happens is that CPU 1 picks up code that was generated
for CPU 2 and which is not correct for it, and thus does
not behave correctly. (In the test case, an instruction that
should UNDEF on the Cortex-R5F but not on the Cortex-A53 will
either UNDEF on the A53 or fail to UNDEF on the R5F, depending
on which CPU happened to get to the test code first.)

> 2. Let's suppose, hypothetically, and based on your example

> from one of commit messages from this series, that we want to

> support two multicore systems:

>     A. Cluster 1: 1 core with FPU; cluster 2: 3 cores without FPU

>     B. Cluster 1: 2 cores with FPU; cluster 2: 1 core without FPU

> Is there an apparatus that would allow the end user specify these

> and similar configurations through command line or a similar means

> (so, without QEMU explicitly supporting such core organization,

> but supporting the single core in question, of course)?


The QEMU definition of "cluster" requires that all the CPUs
in the cluster must share (a) the same features (eg FPU)
and (b) the same view of physical memory -- this is what
defines that they are in the same cluster and not different
ones. So you'd model this as four clusters (assuming that
A and B have different views of physical memory. Otherwise
you could put all the with-FPU cores in one cluster and
the without-FPU cores in a second.)

Real hardware might choose to define what it calls a "cluster"
differently, but that doesn't matter.

> 3. Is there a possibility to have two layer clustering scheme,

> instead of one layer? Cluster/subcluster/core instead of

> cluster/core? For MIPS, there is a need for such organization.

> It looks to me 8 bits for cluster id, and 3 bits for subcluster

> id would be sufficient.


My view is that there is no need for the internal "cluster ID"
to match what the hardware happens to do with SMP CPU IDs
and NUMA architecture. What do you think we miss by this?
(Handling of NUMA architecture is a distinct bit of QEMU code,
unrelated to this.)

thanks
-- PMM
Aleksandar Markovic Jan. 11, 2019, 3:49 p.m. | #4
On Friday, January 11, 2019, Peter Maydell <peter.maydell@linaro.org> wrote:

> On Fri, 11 Jan 2019 at 12:49, Aleksandar Markovic

> <aleksandar.m.mail@gmail.com> wrote:

> > 1. What would be, in more detail, if possible in layman terms,

> > the "bad case" that this series fixes?

>

> I describe this in the cover letter (which also has a link to

> a tarball with a test case demonstrating it):

> > TCG implicitly assumes that all CPUs are alike, because we have

> > a single cache of generated TBs and we don't account for which

> > CPU generated the code or is looking for the TB when adding or

> > searching for generated TBs. This can go wrong in two situations:

> > (1) two CPUs have different physical address spaces (eg CPU 1

> > has one lot of RAM/ROM, and CPU 2 has different RAM/ROM): the

> > physical address alone is then not sufficient to distinguish

> > what code to run

> > (2) two CPUs have different features (eg FPU

> > vs no FPU): since our TCG frontends bake assumptions into the

> > generated code about the presence/absence of features, if a

> > CPU with FPU picks up a TB for one generated without an FPU

> > it will behave wrongly

>

> What happens is that CPU 1 picks up code that was generated

> for CPU 2 and which is not correct for it, and thus does

> not behave correctly. (In the test case, an instruction that

> should UNDEF on the Cortex-R5F but not on the Cortex-A53 will

> either UNDEF on the A53 or fail to UNDEF on the R5F, depending

> on which CPU happened to get to the test code first.)

>

>

Thanks, this example makes the intentions of the patch clearer to me.

If you don't mind, I may take a closer look at MIPS' (and perhaps some
other targets' a little) multi-core design details in the coming weeks, and
see if we could improve the faithfulness of our emulation, or maybe make it
more flexible, or scalable.

Thanks again, and happy holiday season to all!!

Aleksandar




> > 2. Let's suppose, hypothetically, and based on your example

> > from one of commit messages from this series, that we want to

> > support two multicore systems:

> >     A. Cluster 1: 1 core with FPU; cluster 2: 3 cores without FPU

> >     B. Cluster 1: 2 cores with FPU; cluster 2: 1 core without FPU

> > Is there an apparatus that would allow the end user specify these

> > and similar configurations through command line or a similar means

> > (so, without QEMU explicitly supporting such core organization,

> > but supporting the single core in question, of course)?

>

> The QEMU definition of "cluster" requires that all the CPUs

> in the cluster must share (a) the same features (eg FPU)

> and (b) the same view of physical memory -- this is what

> defines that they are in the same cluster and not different

> ones. So you'd model this as four clusters (assuming that

> A and B have different views of physical memory. Otherwise

> you could put all the with-FPU cores in one cluster and

> the without-FPU cores in a second.)

>

> Real hardware might choose to define what it calls a "cluster"

> differently, but that doesn't matter.

>

> > 3. Is there a possibility to have two layer clustering scheme,

> > instead of one layer? Cluster/subcluster/core instead of

> > cluster/core? For MIPS, there is a need for such organization.

> > It looks to me 8 bits for cluster id, and 3 bits for subcluster

> > id would be sufficient.

>

> My view is that there is no need for the internal "cluster ID"

> to match what the hardware happens to do with SMP CPU IDs

> and NUMA architecture. What do you think we miss by this?

> (Handling of NUMA architecture is a distinct bit of QEMU code,

> unrelated to this.)

>

> thanks

> -- PMM

>
<br><br>On Friday, January 11, 2019, Peter Maydell &lt;<a href="mailto:peter.maydell@linaro.org">peter.maydell@linaro.org</a>&gt; wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On Fri, 11 Jan 2019 at 12:49, Aleksandar Markovic<br>
&lt;<a href="mailto:aleksandar.m.mail@gmail.com">aleksandar.m.mail@gmail.com</a>&gt; wrote:<br>
&gt; 1. What would be, in more detail, if possible in layman terms,<br>
&gt; the &quot;bad case&quot; that this series fixes?<br>
<br>
I describe this in the cover letter (which also has a link to<br>
a tarball with a test case demonstrating it):<br>
&gt; TCG implicitly assumes that all CPUs are alike, because we have<br>
&gt; a single cache of generated TBs and we don&#39;t account for which<br>
&gt; CPU generated the code or is looking for the TB when adding or<br>
&gt; searching for generated TBs. This can go wrong in two situations:<br>
&gt; (1) two CPUs have different physical address spaces (eg CPU 1<br>
&gt; has one lot of RAM/ROM, and CPU 2 has different RAM/ROM): the<br>
&gt; physical address alone is then not sufficient to distinguish<br>
&gt; what code to run<br>
&gt; (2) two CPUs have different features (eg FPU<br>
&gt; vs no FPU): since our TCG frontends bake assumptions into the<br>
&gt; generated code about the presence/absence of features, if a<br>
&gt; CPU with FPU picks up a TB for one generated without an FPU<br>
&gt; it will behave wrongly<br>
<br>
What happens is that CPU 1 picks up code that was generated<br>
for CPU 2 and which is not correct for it, and thus does<br>
not behave correctly. (In the test case, an instruction that<br>
should UNDEF on the Cortex-R5F but not on the Cortex-A53 will<br>
either UNDEF on the A53 or fail to UNDEF on the R5F, depending<br>
on which CPU happened to get to the test code first.)<br>
<br></blockquote><div><br></div><div>Thanks, this example makes the intentions of the patch clearer to me.</div><div><br></div><div>If you don&#39;t mind, I may take a closer look at MIPS&#39; (and perhaps some other targets&#39; a little) multi-core design details in the coming weeks, and see if we could improve the faithfulness of our emulation, or maybe make it more flexible, or scalable.</div><div><br></div><div>Thanks again, and happy holiday season to all!!</div><div><br></div><div>Aleksandar</div><div><br></div><div><br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
&gt; 2. Let&#39;s suppose, hypothetically, and based on your example<br>
&gt; from one of commit messages from this series, that we want to<br>
&gt; support two multicore systems:<br>
&gt;     A. Cluster 1: 1 core with FPU; cluster 2: 3 cores without FPU<br>
&gt;     B. Cluster 1: 2 cores with FPU; cluster 2: 1 core without FPU<br>
&gt; Is there an apparatus that would allow the end user specify these<br>
&gt; and similar configurations through command line or a similar means<br>
&gt; (so, without QEMU explicitly supporting such core organization,<br>
&gt; but supporting the single core in question, of course)?<br>
<br>
The QEMU definition of &quot;cluster&quot; requires that all the CPUs<br>
in the cluster must share (a) the same features (eg FPU)<br>
and (b) the same view of physical memory -- this is what<br>
defines that they are in the same cluster and not different<br>
ones. So you&#39;d model this as four clusters (assuming that<br>
A and B have different views of physical memory. Otherwise<br>
you could put all the with-FPU cores in one cluster and<br>
the without-FPU cores in a second.)<br>
<br>
Real hardware might choose to define what it calls a &quot;cluster&quot;<br>
differently, but that doesn&#39;t matter.<br>
<br>
&gt; 3. Is there a possibility to have two layer clustering scheme,<br>
&gt; instead of one layer? Cluster/subcluster/core instead of<br>
&gt; cluster/core? For MIPS, there is a need for such organization.<br>
&gt; It looks to me 8 bits for cluster id, and 3 bits for subcluster<br>
&gt; id would be sufficient.<br>
<br>
My view is that there is no need for the internal &quot;cluster ID&quot;<br>
to match what the hardware happens to do with SMP CPU IDs<br>
and NUMA architecture. What do you think we miss by this?<br>
(Handling of NUMA architecture is a distinct bit of QEMU code,<br>
unrelated to this.)<br>
<br>
thanks<br>
-- PMM<br>
</blockquote>
Aleksandar Markovic Jan. 14, 2019, 1:08 a.m. | #5
On Tuesday, January 8, 2019, Peter Maydell <peter.maydell@linaro.org> wrote:

> Include the cluster number in the hash we use to look

> up TBs. This is important because a TB that is valid

> for one cluster at a given physical address and set

> of CPU flags is not necessarily valid for another:

> the two clusters may have different views of physical

> memory, or may have different CPU features (eg FPU

> present or absent).

>

>

Hi, Peter.

I do understand the definition of cluster_index in the sense of this
series. However, it looks to me that the term "cluster" is generally
overused in areas where we work. This may lead to some confusion for future
developers, and let me suggest some other name, like "tcg_cluster_index" or
"tcg_group_id", or "translation_group_id". Admittedly, they all sound ugly
to me too. But having the name that would clearly separate this id from too
generic "cluster_index" IMHO would save lots of time during potential
related future development.

(Needless to say, for example, we in MIPS, for multi-core systems,
group cores in clusters, that actually do not have anything to do with
clusters in TCG sense...)

Sincerely,

Aleksandar



> We put the cluster number in the high 8 bits of the

> TB cflags. This gives us up to 256 clusters, which should

> be enough for anybody. If we ever need more, or need

> more bits in cflags for other purposes, we could make

> tb_hash_func() take more data (and expand qemu_xxhash7()

> to qemu_xxhash8()).

>

> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

> ---

>  include/exec/exec-all.h   | 4 +++-

>  accel/tcg/cpu-exec.c      | 4 ++++

>  accel/tcg/translate-all.c | 3 +++

>  3 files changed, 10 insertions(+), 1 deletion(-)

>

> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h

> index 815e5b1e838..aa7b81aaf01 100644

> --- a/include/exec/exec-all.h

> +++ b/include/exec/exec-all.h

> @@ -351,9 +351,11 @@ struct TranslationBlock {

>  #define CF_USE_ICOUNT  0x00020000

>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held

> */

>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context

> */

> +#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */

> +#define CF_CLUSTER_SHIFT 24

>  /* cflags' mask for hashing/comparison */

>  #define CF_HASH_MASK   \

> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL)

> +    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL |

> CF_CLUSTER_MASK)

>

>      /* Per-vCPU dynamic tracing state used to generate this TB */

>      uint32_t trace_vcpu_dstate;

> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c

> index 870027d4359..e578a1a3aee 100644

> --- a/accel/tcg/cpu-exec.c

> +++ b/accel/tcg/cpu-exec.c

> @@ -336,6 +336,10 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu,

> target_ulong pc,

>          return NULL;

>      }

>      desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;

> +

> +    cf_mask &= ~CF_CLUSTER_MASK;

> +    cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;

> +

>      h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);

>      return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);

>  }

> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c

> index 639f0b27287..ba27f5acc8c 100644

> --- a/accel/tcg/translate-all.c

> +++ b/accel/tcg/translate-all.c

> @@ -1692,6 +1692,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

>          cflags |= CF_NOCACHE | 1;

>      }

>

> +    cflags &= ~CF_CLUSTER_MASK;

> +    cflags |= cpu->cluster_index << CF_CLUSTER_SHIFT;

> +

>   buffer_overflow:

>      tb = tb_alloc(pc);

>      if (unlikely(!tb)) {

> --

> 2.19.2

>

>

>
<br><br>On Tuesday, January 8, 2019, Peter Maydell &lt;<a href="mailto:peter.maydell@linaro.org">peter.maydell@linaro.org</a>&gt; wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Include the cluster number in the hash we use to look<br>
up TBs. This is important because a TB that is valid<br>
for one cluster at a given physical address and set<br>
of CPU flags is not necessarily valid for another:<br>
the two clusters may have different views of physical<br>
memory, or may have different CPU features (eg FPU<br>
present or absent).<br>
<br></blockquote><div><br></div><div>Hi, Peter.</div><div><br></div><div>I do understand the definition of cluster_index in the sense of this series. However, it looks to me that the term &quot;cluster&quot; is generally overused in areas where we work. This may lead to some confusion for future developers, and let me suggest some other name, like &quot;tcg_cluster_index&quot; or &quot;tcg_group_id&quot;, or &quot;translation_group_id&quot;. Admittedly, they all sound ugly to me too. But having the name that would clearly separate this id from too generic &quot;cluster_index&quot; IMHO would save lots of time during potential related future development.</div><div><br></div><div>(Needless to say, for example, we in MIPS, for multi-core systems, group cores in clusters, that actually do not have anything to do with clusters in TCG sense...)</div><div><br></div><div>Sincerely,</div><div><br></div><div>Aleksandar</div><div><br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
We put the cluster number in the high 8 bits of the<br>
TB cflags. This gives us up to 256 clusters, which should<br>
be enough for anybody. If we ever need more, or need<br>
more bits in cflags for other purposes, we could make<br>
tb_hash_func() take more data (and expand qemu_xxhash7()<br>
to qemu_xxhash8()).<br>
<br>
Signed-off-by: Peter Maydell &lt;<a href="mailto:peter.maydell@linaro.org">peter.maydell@linaro.org</a>&gt;<br>

---<br>
 include/exec/exec-all.h   | 4 +++-<br>
 accel/tcg/cpu-exec.c      | 4 ++++<br>
 accel/tcg/translate-all.c | 3 +++<br>
 3 files changed, 10 insertions(+), 1 deletion(-)<br>
<br>
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h<br>
index 815e5b1e838..aa7b81aaf01 100644<br>
--- a/include/exec/exec-all.h<br>
+++ b/include/exec/exec-all.h<br>
@@ -351,9 +351,11 @@ struct TranslationBlock {<br>
 #define CF_USE_ICOUNT  0x00020000<br>
 #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */<br>
 #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */<br>
+#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */<br>
+#define CF_CLUSTER_SHIFT 24<br>
 /* cflags&#39; mask for hashing/comparison */<br>
 #define CF_HASH_MASK   \<br>
-    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL)<br>
+    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)<br>
<br>
     /* Per-vCPU dynamic tracing state used to generate this TB */<br>
     uint32_t trace_vcpu_dstate;<br>
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c<br>
index 870027d4359..e578a1a3aee 100644<br>
--- a/accel/tcg/cpu-exec.c<br>
+++ b/accel/tcg/cpu-exec.c<br>
@@ -336,6 +336,10 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,<br>
         return NULL;<br>
     }<br>
     desc.phys_page1 = phys_pc &amp; TARGET_PAGE_MASK;<br>
+<br>
+    cf_mask &amp;= ~CF_CLUSTER_MASK;<br>
+    cf_mask |= cpu-&gt;cluster_index &lt;&lt; CF_CLUSTER_SHIFT;<br>
+<br>
     h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu-&gt;trace_dstate);<br>
     return qht_lookup_custom(&amp;tb_ctx.<wbr>htable, &amp;desc, h, tb_lookup_cmp);<br>
 }<br>
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c<br>
index 639f0b27287..ba27f5acc8c 100644<br>
--- a/accel/tcg/translate-all.c<br>
+++ b/accel/tcg/translate-all.c<br>
@@ -1692,6 +1692,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,<br>
         cflags |= CF_NOCACHE | 1;<br>
     }<br>
<br>
+    cflags &amp;= ~CF_CLUSTER_MASK;<br>
+    cflags |= cpu-&gt;cluster_index &lt;&lt; CF_CLUSTER_SHIFT;<br>
+<br>
  buffer_overflow:<br>
     tb = tb_alloc(pc);<br>
     if (unlikely(!tb)) {<br>
-- <br>
2.19.2<br>
<br>
<br>
</blockquote>
Peter Maydell Jan. 14, 2019, 10:27 a.m. | #6
On Mon, 14 Jan 2019 at 01:08, Aleksandar Markovic
<aleksandar.m.mail@gmail.com> wrote:
>

>

>

> On Tuesday, January 8, 2019, Peter Maydell <peter.maydell@linaro.org> wrote:

>>

>> Include the cluster number in the hash we use to look

>> up TBs. This is important because a TB that is valid

>> for one cluster at a given physical address and set

>> of CPU flags is not necessarily valid for another:

>> the two clusters may have different views of physical

>> memory, or may have different CPU features (eg FPU

>> present or absent).


> I do understand the definition of cluster_index in the sense of this series. However, it looks to me that the term "cluster" is generally overused in areas where we work. This may lead to some confusion for future developers, and let me suggest some other name, like "tcg_cluster_index" or "tcg_group_id", or "translation_group_id". Admittedly, they all sound ugly to me too. But having the name that would clearly separate this id from too generic "cluster_index" IMHO would save lots of time during potential related future development.

>

> (Needless to say, for example, we in MIPS, for multi-core systems, group cores in clusters, that actually do not have anything to do with clusters in TCG sense...)


Yeah, the term is a bit overloaded. Arm also has clusters
that are used more in the NUMA sense. However, the
term we have is what is in git master currently...

thanks
-- PMM
Peter Maydell Jan. 14, 2019, 11:53 a.m. | #7
On Mon, 14 Jan 2019 at 01:08, Aleksandar Markovic
<aleksandar.m.mail@gmail.com> wrote:
> I do understand the definition of cluster_index in the sense

> of this series. However, it looks to me that the term

> "cluster" is generally overused in areas where we work.

> This may lead to some confusion for future developers, and

> let me suggest some other name, like "tcg_cluster_index" or

> "tcg_group_id", or "translation_group_id".


Note also that the cluster index is not purely a TCG
concept -- it also (in master at the moment) affects
the gdbstub interface. Different clusters appear as
separate processes in gdb, whereas different CPUs in
the same cluster are different threads in the same process.
(And by default only the first cluster's CPUs will
appear, unless you explicitly attach to the second cluster:
this is a limitation of how gdb's UI handles multiple
processes.)

thanks
-- PMM

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 815e5b1e838..aa7b81aaf01 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -351,9 +351,11 @@  struct TranslationBlock {
 #define CF_USE_ICOUNT  0x00020000
 #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */
 #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */
+#define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */
+#define CF_CLUSTER_SHIFT 24
 /* cflags' mask for hashing/comparison */
 #define CF_HASH_MASK   \
-    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL)
+    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)
 
     /* Per-vCPU dynamic tracing state used to generate this TB */
     uint32_t trace_vcpu_dstate;
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 870027d4359..e578a1a3aee 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -336,6 +336,10 @@  TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
         return NULL;
     }
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
+
+    cf_mask &= ~CF_CLUSTER_MASK;
+    cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;
+
     h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
     return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
 }
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 639f0b27287..ba27f5acc8c 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1692,6 +1692,9 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
         cflags |= CF_NOCACHE | 1;
     }
 
+    cflags &= ~CF_CLUSTER_MASK;
+    cflags |= cpu->cluster_index << CF_CLUSTER_SHIFT;
+
  buffer_overflow:
     tb = tb_alloc(pc);
     if (unlikely(!tb)) {