
[v5,13/13] cpu-exec: replace cpu->queued_work with GArray

Message ID 1470158864-17651-14-git-send-email-alex.bennee@linaro.org
State New

Commit Message

Alex Bennée Aug. 2, 2016, 5:27 p.m. UTC
Under conditions of high memory stress the additional small mallocs made
by a linked list are a source of potential memory fragmentation. As we
have worked hard to avoid mallocs elsewhere when queuing work we might
as well do the same for the list. We convert the list to an auto-resizing
GArray which resizes in steps of powers of 2.

In theory the GArray could be mostly lockless, but for the moment we keep
the locking scheme as before. However, another advantage of having an
array is that we can allocate a new one and process the old one without
bouncing the lock.

This should also be more cache friendly, as we don't need to bounce
between cache lines as we work through the saved data.
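The append/swap/drain pattern this amounts to is roughly the following
simplified sketch; the work_item struct and plain GMutex below are
illustrative stand-ins for QEMU's struct qemu_work_item and
cpu->work_mutex, not the actual patch code:

/* Simplified sketch of the append/swap/drain pattern. */
#include <glib.h>

typedef struct {
    void (*func)(void *data);
    void *data;
} work_item;

static GMutex work_mutex;
static GArray *queued_work;    /* one per vCPU in the real patch */

static void queue_work(work_item *wi)
{
    g_mutex_lock(&work_mutex);
    if (!queued_work) {
        /* zero-terminated, cleared, element size, 16 reserved slots;
         * the array grows in power-of-2 steps as items are appended */
        queued_work = g_array_sized_new(TRUE, TRUE, sizeof(work_item), 16);
    }
    /* copies the item into the array: no per-item malloc */
    g_array_append_val(queued_work, *wi);
    g_mutex_unlock(&work_mutex);
}

static void process_queued_work(void)
{
    GArray *work_list;
    guint i;

    /* detach the whole array under the lock ... */
    g_mutex_lock(&work_mutex);
    work_list = queued_work;
    queued_work = NULL;
    g_mutex_unlock(&work_mutex);

    /* ... then walk it without bouncing the lock per item */
    if (work_list) {
        for (i = 0; i < work_list->len; i++) {
            work_item *wi = &g_array_index(work_list, work_item, i);
            wi->func(wi->data);
        }
        g_array_free(work_list, TRUE);
    }
}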

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
 cpu-exec-common.c | 109 +++++++++++++++++++++++++++++-------------------------
 cpus.c            |   2 +-
 include/qom/cpu.h |   6 +--
 3 files changed, 61 insertions(+), 56 deletions(-)

-- 
2.7.4

Comments

Alex Bennée Aug. 2, 2016, 5:42 p.m. UTC | #1
Alex Bennée <alex.bennee@linaro.org> writes:

> Under times of high memory stress the additional small mallocs by a
> linked list are source of potential memory fragmentation. As we have
> worked hard to avoid mallocs elsewhere when queuing work we might as
> well do the same for the list. We convert the lists to a auto-resizeing
> GArray which will re-size in steps of powers of 2.
<snip>
> diff --git a/cpu-exec-common.c b/cpu-exec-common.c
> index 6d5da15..745d973 100644
> --- a/cpu-exec-common.c
> +++ b/cpu-exec-common.c
> @@ -113,17 +113,18 @@ void wait_safe_cpu_work(void)
>  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>  {
>      qemu_mutex_lock(&cpu->work_mutex);
> -    if (cpu->queued_work_first == NULL) {
> -        cpu->queued_work_first = wi;
> -    } else {
> -        cpu->queued_work_last->next = wi;
> +
> +    if (!cpu->queued_work) {
> +        cpu->queued_work = g_array_sized_new(true, true,
> +                             sizeof(struct qemu_work_item), 16);
>      }
> -    cpu->queued_work_last = wi;
> -    wi->next = NULL;
> -    wi->done = false;
> +    trace_queue_work_on_cpu(cpu->cpu_index, wi->safe, cpu->queued_work->len);

Oops, this was left over from testing, I've posted an updated version of
the patch.

> +
> +    g_array_append_val(cpu->queued_work, *wi);
>      if (wi->safe) {
>          atomic_inc(&safe_work_pending);
>      }
> +
>      qemu_mutex_unlock(&cpu->work_mutex);
>
>      if (!wi->safe) {
> @@ -138,6 +139,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>  {
>      struct qemu_work_item wi;
> +    bool done = false;
>
>      if (qemu_cpu_is_self(cpu)) {
>          func(cpu, data);
> @@ -146,11 +148,11 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>
>      wi.func = func;
>      wi.data = data;
> -    wi.free = false;
>      wi.safe = false;
> +    wi.done = &done;
>
>      queue_work_on_cpu(cpu, &wi);
> -    while (!atomic_mb_read(&wi.done)) {
> +    while (!atomic_mb_read(&done)) {
>          CPUState *self_cpu = current_cpu;
>
>          qemu_cond_wait(&qemu_work_cond, qemu_get_cpu_work_mutex());
> @@ -160,70 +162,75 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>
>  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>  {
> -    struct qemu_work_item *wi;
> +    struct qemu_work_item wi;
>
>      if (qemu_cpu_is_self(cpu)) {
>          func(cpu, data);
>          return;
>      }
>
> -    wi = g_malloc0(sizeof(struct qemu_work_item));
> -    wi->func = func;
> -    wi->data = data;
> -    wi->free = true;
> -    wi->safe = false;
> +    wi.func = func;
> +    wi.data = data;
> +    wi.safe = false;
> +    wi.done = NULL;
>
> -    queue_work_on_cpu(cpu, wi);
> +    queue_work_on_cpu(cpu, &wi);
>  }
>
>  void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>  {
> -    struct qemu_work_item *wi;
> +    struct qemu_work_item wi;
>
> -    wi = g_malloc0(sizeof(struct qemu_work_item));
> -    wi->func = func;
> -    wi->data = data;
> -    wi->free = true;
> -    wi->safe = true;
> +    wi.func = func;
> +    wi.data = data;
> +    wi.safe = true;
> +    wi.done = NULL;
>
> -    queue_work_on_cpu(cpu, wi);
> +    queue_work_on_cpu(cpu, &wi);
>  }
>
>  void process_queued_cpu_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
> -
> -    if (cpu->queued_work_first == NULL) {
> -        return;
> -    }
> +    GArray *work_list = NULL;
> +    int i;
>
>      qemu_mutex_lock(&cpu->work_mutex);
> -    while (cpu->queued_work_first != NULL) {
> -        wi = cpu->queued_work_first;
> -        cpu->queued_work_first = wi->next;
> -        if (!cpu->queued_work_first) {
> -            cpu->queued_work_last = NULL;
> -        }
> -        if (wi->safe) {
> -            while (tcg_pending_threads) {
> -                qemu_cond_wait(&qemu_exclusive_cond,
> -                               qemu_get_cpu_work_mutex());
> +
> +    work_list = cpu->queued_work;
> +    cpu->queued_work = NULL;
> +
> +    qemu_mutex_unlock(&cpu->work_mutex);
> +
> +    if (work_list) {
> +
> +        g_assert(work_list->len > 0);
> +
> +        for (i = 0; i < work_list->len; i++) {
> +            wi = &g_array_index(work_list, struct qemu_work_item, i);
> +
> +            if (wi->safe) {
> +                while (tcg_pending_threads) {
> +                    qemu_cond_wait(&qemu_exclusive_cond,
> +                                   qemu_get_cpu_work_mutex());
> +                }
>              }
> -        }
> -        qemu_mutex_unlock(&cpu->work_mutex);
> -        wi->func(cpu, wi->data);
> -        qemu_mutex_lock(&cpu->work_mutex);
> -        if (wi->safe) {
> -            if (!atomic_dec_fetch(&safe_work_pending)) {
> -                qemu_cond_broadcast(&qemu_safe_work_cond);
> +
> +            wi->func(cpu, wi->data);
> +
> +            if (wi->safe) {
> +                if (!atomic_dec_fetch(&safe_work_pending)) {
> +                    qemu_cond_broadcast(&qemu_safe_work_cond);
> +                }
> +            }
> +
> +            if (wi->done) {
> +                atomic_mb_set(wi->done, true);
>              }
>          }
> -        if (wi->free) {
> -            g_free(wi);
> -        } else {
> -            atomic_mb_set(&wi->done, true);
> -        }
> +
> +        trace_process_queued_cpu_work(cpu->cpu_index, work_list->len);

And so was this.

--
Alex Bennée
Alex Bennée Aug. 3, 2016, 8:34 a.m. UTC | #2
Emilio G. Cota <cota@braap.org> writes:

> On Tue, Aug 02, 2016 at 18:27:44 +0100, Alex Bennée wrote:
>> Under times of high memory stress the additional small mallocs by a
>> linked list are source of potential memory fragmentation. As we have
>> worked hard to avoid mallocs elsewhere when queuing work we might as
>> well do the same for the list. We convert the lists to a auto-resizeing
>> GArray which will re-size in steps of powers of 2.
>
> Would be nice to see numbers on how this compares to simply using
> tcmalloc/jemalloc (or the glibc allocator, really).

glib just uses the glibc malloc/realloc underneath, so it is all the same
allocator, just a different usage pattern.

I was trying to find a decent way to measure the allocation usage and
fragmentation other than watching the differential in htop's memory
usage display with the two methods. Any ideas/suggestions?
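One rough option, assuming a glibc-based host, would be to snapshot the
allocator's own statistics before and after a stress run using glibc's
malloc_stats()/malloc_info(). A minimal standalone sketch (the stress()
loop is only a placeholder pattern, not QEMU code):

/* Sketch: dump glibc allocator statistics around an allocation pattern. */
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

static void stress(void)
{
    /* placeholder for whatever exercises the work queue: interleave
     * small and large allocations, then free them out of order */
    void *p[1024];
    int i;

    for (i = 0; i < 1024; i++) {
        p[i] = malloc((i % 2) ? 32 : 4096);
    }
    for (i = 0; i < 1024; i += 2) {
        free(p[i]);
    }
    for (i = 1; i < 1024; i += 2) {
        free(p[i]);
    }
}

int main(void)
{
    malloc_stats();           /* arena and in-use bytes before, to stderr */
    stress();
    malloc_stats();           /* ... and after */
    malloc_info(0, stdout);   /* XML dump including per-arena free chunks */
    return 0;
}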

>
> Thanks,
>
> 		Emilio

--
Alex Bennée

Patch

diff --git a/cpu-exec-common.c b/cpu-exec-common.c
index 6d5da15..745d973 100644
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -113,17 +113,18 @@  void wait_safe_cpu_work(void)
 static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 {
     qemu_mutex_lock(&cpu->work_mutex);
-    if (cpu->queued_work_first == NULL) {
-        cpu->queued_work_first = wi;
-    } else {
-        cpu->queued_work_last->next = wi;
+
+    if (!cpu->queued_work) {
+        cpu->queued_work = g_array_sized_new(true, true,
+                             sizeof(struct qemu_work_item), 16);
     }
-    cpu->queued_work_last = wi;
-    wi->next = NULL;
-    wi->done = false;
+    trace_queue_work_on_cpu(cpu->cpu_index, wi->safe, cpu->queued_work->len);
+
+    g_array_append_val(cpu->queued_work, *wi);
     if (wi->safe) {
         atomic_inc(&safe_work_pending);
     }
+
     qemu_mutex_unlock(&cpu->work_mutex);
 
     if (!wi->safe) {
@@ -138,6 +139,7 @@  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
     struct qemu_work_item wi;
+    bool done = false;
 
     if (qemu_cpu_is_self(cpu)) {
         func(cpu, data);
@@ -146,11 +148,11 @@  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 
     wi.func = func;
     wi.data = data;
-    wi.free = false;
     wi.safe = false;
+    wi.done = &done;
 
     queue_work_on_cpu(cpu, &wi);
-    while (!atomic_mb_read(&wi.done)) {
+    while (!atomic_mb_read(&done)) {
         CPUState *self_cpu = current_cpu;
 
         qemu_cond_wait(&qemu_work_cond, qemu_get_cpu_work_mutex());
@@ -160,70 +162,75 @@  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 
 void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
-    struct qemu_work_item *wi;
+    struct qemu_work_item wi;
 
     if (qemu_cpu_is_self(cpu)) {
         func(cpu, data);
         return;
     }
 
-    wi = g_malloc0(sizeof(struct qemu_work_item));
-    wi->func = func;
-    wi->data = data;
-    wi->free = true;
-    wi->safe = false;
+    wi.func = func;
+    wi.data = data;
+    wi.safe = false;
+    wi.done = NULL;
 
-    queue_work_on_cpu(cpu, wi);
+    queue_work_on_cpu(cpu, &wi);
 }
 
 void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
-    struct qemu_work_item *wi;
+    struct qemu_work_item wi;
 
-    wi = g_malloc0(sizeof(struct qemu_work_item));
-    wi->func = func;
-    wi->data = data;
-    wi->free = true;
-    wi->safe = true;
+    wi.func = func;
+    wi.data = data;
+    wi.safe = true;
+    wi.done = NULL;
 
-    queue_work_on_cpu(cpu, wi);
+    queue_work_on_cpu(cpu, &wi);
 }
 
 void process_queued_cpu_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
-
-    if (cpu->queued_work_first == NULL) {
-        return;
-    }
+    GArray *work_list = NULL;
+    int i;
 
     qemu_mutex_lock(&cpu->work_mutex);
-    while (cpu->queued_work_first != NULL) {
-        wi = cpu->queued_work_first;
-        cpu->queued_work_first = wi->next;
-        if (!cpu->queued_work_first) {
-            cpu->queued_work_last = NULL;
-        }
-        if (wi->safe) {
-            while (tcg_pending_threads) {
-                qemu_cond_wait(&qemu_exclusive_cond,
-                               qemu_get_cpu_work_mutex());
+
+    work_list = cpu->queued_work;
+    cpu->queued_work = NULL;
+
+    qemu_mutex_unlock(&cpu->work_mutex);
+
+    if (work_list) {
+
+        g_assert(work_list->len > 0);
+
+        for (i = 0; i < work_list->len; i++) {
+            wi = &g_array_index(work_list, struct qemu_work_item, i);
+
+            if (wi->safe) {
+                while (tcg_pending_threads) {
+                    qemu_cond_wait(&qemu_exclusive_cond,
+                                   qemu_get_cpu_work_mutex());
+                }
             }
-        }
-        qemu_mutex_unlock(&cpu->work_mutex);
-        wi->func(cpu, wi->data);
-        qemu_mutex_lock(&cpu->work_mutex);
-        if (wi->safe) {
-            if (!atomic_dec_fetch(&safe_work_pending)) {
-                qemu_cond_broadcast(&qemu_safe_work_cond);
+
+            wi->func(cpu, wi->data);
+
+            if (wi->safe) {
+                if (!atomic_dec_fetch(&safe_work_pending)) {
+                    qemu_cond_broadcast(&qemu_safe_work_cond);
+                }
+            }
+
+            if (wi->done) {
+                atomic_mb_set(wi->done, true);
             }
         }
-        if (wi->free) {
-            g_free(wi);
-        } else {
-            atomic_mb_set(&wi->done, true);
-        }
+
+        trace_process_queued_cpu_work(cpu->cpu_index, work_list->len);
+        qemu_cond_broadcast(&qemu_work_cond);
+        g_array_free(work_list, true);
     }
-    qemu_mutex_unlock(&cpu->work_mutex);
-    qemu_cond_broadcast(&qemu_work_cond);
 }
diff --git a/cpus.c b/cpus.c
index b712204..1ea60e4 100644
--- a/cpus.c
+++ b/cpus.c
@@ -88,7 +88,7 @@  bool cpu_is_stopped(CPUState *cpu)
 
 static bool cpu_thread_is_idle(CPUState *cpu)
 {
-    if (cpu->stop || cpu->queued_work_first) {
+    if (cpu->stop || cpu->queued_work) {
         return false;
     }
     if (cpu_is_stopped(cpu)) {
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index dee5ad0..060a473 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -235,11 +235,9 @@  struct kvm_run;
 typedef void (*run_on_cpu_func)(CPUState *cpu, void *data);
 
 struct qemu_work_item {
-    struct qemu_work_item *next;
     run_on_cpu_func func;
     void *data;
-    int done;
-    bool free;
+    bool *done;
     bool safe;
 };
 
@@ -318,7 +316,7 @@  struct CPUState {
     sigjmp_buf jmp_env;
 
     QemuMutex work_mutex;
-    struct qemu_work_item *queued_work_first, *queued_work_last;
+    GArray *queued_work;
 
     CPUAddressSpace *cpu_ases;
     int num_ases;