Message ID: 1464122781-11280-1-git-send-email-maxim.uvarov@linaro.org
State: Superseded
On Tue, May 24, 2016 at 3:46 PM, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:

> Make test a little bit simple. Add memory free and
> take care about overflow using cast to int:
> (int)odp_atomic_load_u32(consume_count)
> Where number of consumer threads can dequeue from ring
> and decrease atomic u32.
>
> Signed-off-by: Maxim Uvarov <maxim.uvarov@linaro.org>

Reviewed-and-tested-by: Bill Fischofer <bill.fischofer@linaro.org>

> ---
>  platform/linux-generic/test/ring/ring_stress.c | 74 ++++++++++++--------------
>  1 file changed, 34 insertions(+), 40 deletions(-)
>
> diff --git a/platform/linux-generic/test/ring/ring_stress.c b/platform/linux-generic/test/ring/ring_stress.c
> index c68419f..a7e89a8 100644
> --- a/platform/linux-generic/test/ring/ring_stress.c
> +++ b/platform/linux-generic/test/ring/ring_stress.c
> @@ -156,12 +156,11 @@ void ring_test_stress_N_M_producer_consumer(void)
>      consume_count = retrieve_consume_count();
>      CU_ASSERT(consume_count != NULL);
>
> -    /* in N:M test case, producer threads are always
> -     * greater or equal to consumer threads, thus produce
> -     * enought "goods" to be consumed by consumer threads.
> +    /* all producer threads try to fill ring to RING_SIZE,
> +     * while consumers threads dequeue from ring with PIECE_BULK
> +     * blocks. Multiply on 100 to add more tries.
>       */
> -    odp_atomic_init_u32(consume_count,
> -                        (worker_param.numthrds) / 2);
> +    odp_atomic_init_u32(consume_count, RING_SIZE / PIECE_BULK * 100);
>
>      /* kick the workers */
>      odp_cunit_thread_create(stress_worker, &worker_param);
> @@ -202,8 +201,15 @@ static odp_atomic_u32_t *retrieve_consume_count(void)
>  /* worker function for multiple producer instances */
>  static int do_producer(_ring_t *r)
>  {
> -    int i, result = 0;
> +    int i;
>      void **enq = NULL;
> +    odp_atomic_u32_t *consume_count;
> +
> +    consume_count = retrieve_consume_count();
> +    if (consume_count == NULL) {
> +        LOG_ERR("cannot retrieve expected consume count.\n");
> +        return -1;
> +    }
>
>      /* allocate dummy object pointers for enqueue */
>      enq = malloc(PIECE_BULK * 2 * sizeof(void *));
> @@ -216,26 +222,28 @@ static int do_producer(_ring_t *r)
>      for (i = 0; i < PIECE_BULK; i++)
>          enq[i] = (void *)(unsigned long)i;
>
> -    do {
> -        result = _ring_mp_enqueue_bulk(r, enq, PIECE_BULK);
> -        if (0 == result) {
> -            free(enq);
> -            return 0;
> -        }
> -        usleep(10); /* wait for consumer threads */
> -    } while (!_ring_full(r));
> +    while ((int)odp_atomic_load_u32(consume_count) > 0) {
> +        /* produce as much data as we can to the ring */
> +        if (!_ring_mp_enqueue_bulk(r, enq, PIECE_BULK))
> +            usleep(10);
> +    }
>
> +    free(enq);
>      return 0;
>  }
>
>  /* worker function for multiple consumer instances */
>  static int do_consumer(_ring_t *r)
>  {
> -    int i, result = 0;
> +    int i;
>      void **deq = NULL;
> -    odp_atomic_u32_t *consume_count = NULL;
> -    const char *message = "test OK!";
> -    const char *mismatch = "data mismatch..lockless enq/deq failed.";
> +    odp_atomic_u32_t *consume_count;
> +
> +    consume_count = retrieve_consume_count();
> +    if (consume_count == NULL) {
> +        LOG_ERR("cannot retrieve expected consume count.\n");
> +        return -1;
> +    }
>
>      /* allocate dummy object pointers for dequeue */
>      deq = malloc(PIECE_BULK * 2 * sizeof(void *));
> @@ -244,31 +252,17 @@ static int do_consumer(_ring_t *r)
>          return 0; /* not failure, skip for insufficient memory */
>      }
>
> -    consume_count = retrieve_consume_count();
> -    if (consume_count == NULL) {
> -        LOG_ERR("cannot retrieve expected consume count.\n");
> -        return -1;
> -    }
> -
> -    while (odp_atomic_load_u32(consume_count) > 0) {
> -        result = _ring_mc_dequeue_bulk(r, deq, PIECE_BULK);
> -        if (0 == result) {
> -            /* evaluate the data pattern */
> -            for (i = 0; i < PIECE_BULK; i++) {
> -                if (deq[i] != (void *)(unsigned long)i) {
> -                    result = -1;
> -                    message = mismatch;
> -                    break;
> -                }
> -            }
> -
> -            free(deq);
> -            LOG_ERR("%s\n", message);
> +    while ((int)odp_atomic_load_u32(consume_count) > 0) {
> +        if (!_ring_mc_dequeue_bulk(r, deq, PIECE_BULK)) {
>              odp_atomic_dec_u32(consume_count);
> -            return result;
> +
> +            /* evaluate the data pattern */
> +            for (i = 0; i < PIECE_BULK; i++)
> +                CU_ASSERT(deq[i] == (void *)(unsigned long)i);
>          }
> -        usleep(10); /* wait for producer threads */
>      }
> +
> +    free(deq);
>      return 0;
>  }
>
> --
> 2.7.1.250.gff4ea60
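The cast called out in the commit message is the crux of the fix: several consumer threads race on odp_atomic_dec_u32(), so the counter can be decremented past zero and wrap around to a huge unsigned value, after which a plain while (consume_count > 0) spins forever. A minimal standalone sketch of that wrap-around behaviour (illustrative only, not code from the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* stands in for the shared consume_count atomic */
        uint32_t consume_count = 0;

        /* a racing consumer decrements past zero: wraps to 4294967295 */
        consume_count--;

        /* unsigned compare: the wrapped value is still "positive",
         * so a while (consume_count > 0) loop never exits */
        if (consume_count > 0)
            printf("unsigned view: %u, loop keeps spinning\n",
                   consume_count);

        /* the patch casts to int instead; converting an out-of-range
         * value is implementation-defined in C, but on the gcc/clang
         * targets discussed in this thread it yields -1, so the loop
         * condition goes false and the worker exits */
        if ((int)consume_count <= 0)
            printf("int view: %d, loop exits\n", (int)consume_count);

        return 0;
    }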
Hi, sorry about the memory leak issue and thanks Maxim for your patch.

Best Regards, Yi

On 25 May 2016 at 09:05, Bill Fischofer <bill.fischofer@linaro.org> wrote:
> [snip]
Interesting that with clang it passes but with gcc it hangs.

Maxim.

On 05/25/16 04:24, Yi He wrote:
> Hi, sorry about the memory leak issue and thanks Maxim for your patch.
>
> Best Regards, Yi
>
> [snip]
The issue is that -mcx16 and -O3 generate buggy code on gcc4.8 and gcc4.9,
but it works fine with gcc5.3, or on the old gcc without optimization,
i.e. with -O0.

So there are 2 options:
1) change configure.ac to not add the -mcx16 flag;
2) fix the ring code to support -mcx16 on old compilers.

I think we should go the first path.

Maxim.

On 05/25/16 04:24, Yi He wrote:
> Hi, sorry about the memory leak issue and thanks Maxim for your patch.
>
> Best Regards, Yi
>
> [snip]
On Wed, May 25, 2016 at 10:18 AM, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
> The issue is that -mcx16 and -O3 generate buggy code on gcc4.8 and
> gcc4.9, but it works fine with gcc5.3, or on the old gcc without
> optimization, i.e. with -O0.
> So there are 2 options:
> 1) change configure.ac to not add the -mcx16 flag;
> 2) fix the ring code to support -mcx16 on old compilers.
>
> I think we should go the first path.

Agreed. We shouldn't be wasting time trying to fix/optimize around known
GCC bugs. Is it safe to say that -mcx16 works for GCC 5 and higher, or do
we need to restrict to 5.3 and above?

> Maxim.
>
> [snip]
On 25 May 2016 at 17:18, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
> The issue is that -mcx16 and -O3 generate buggy code on gcc4.8 and
> gcc4.9, but it works fine with gcc5.3, or on the old gcc without
> optimization, i.e. with -O0.
> So there are 2 options:
> 1) change configure.ac to not add the -mcx16 flag;
> 2) fix the ring code to support -mcx16 on old compilers.

I don't understand this one. If the (ring) code works without -mcx16
(which enables the cmpxchg16b instruction on x86), it should also work
with the -mcx16 flag. I can't understand how the code on its own could be
wrong. Maybe the compiler is doing something wrong when this option is
specified (e.g. code generation gets screwed up).

> I think we should go the first path.
>
> Maxim.
>
> [snip]
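For background on the cmpxchg16b remark above: on x86-64, the -mcx16 flag
lets GCC emit a 16-byte compare-and-swap. A minimal sketch of the kind of
operation the flag unlocks (illustrative only, using a generic GCC __sync
builtin rather than code from the ODP ring):

    #include <stdio.h>

    int main(void)
    {
        /* __int128 is 16 bytes wide; with gcc on x86-64 a CAS on it
         * needs cmpxchg16b, so build with: gcc -mcx16 cas16.c
         * (without -mcx16, older gcc typically fails to compile or
         * link the 16-byte variant of this builtin) */
        __int128 val = 1;

        if (__sync_bool_compare_and_swap(&val, (__int128)1, (__int128)2))
            printf("16-byte CAS succeeded, val is now %d\n", (int)val);

        return 0;
    }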
On 26 May 2016 at 10:33, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
> I don't understand this one. If the (ring) code works without -mcx16
> (which enables the cmpxchg16b instruction on x86), it should also work
> with the -mcx16 flag. I can't understand how the code on its own could
> be wrong. Maybe the compiler is doing something wrong when this option
> is specified (e.g. code generation gets screwed up).

Ola, I just wrote the facts which I see on my 14.10 Ubuntu with gcc4.8
and gcc4.9. And Bill has gcc5.3 and he is unable to reproduce that bug.
It also works well with clang. I could spend some time installing gcc5.4,
disassembling the 2 versions and comparing the difference, but I think we
should not waste time on that if everything works well with a newer
compiler version.

Maxim.

> [snip]
On 26 May 2016 at 10:02, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
> Ola, I just wrote the facts which I see on my 14.10 Ubuntu with gcc4.8
> and gcc4.9. And Bill has gcc5.3 and he is unable to reproduce that bug.
> It also works well with clang. I could spend some time installing
> gcc5.4, disassembling the 2 versions and comparing the difference, but
> I think we should not waste time on that if everything works well with
> a newer compiler version.

So you don't actually know that the ring code can be "fixed" (corrected
or worked around) in order to avoid this problem?

I do agree that if we only see a problem with an older version of a
specific compiler, then it is most likely caused by a bug in that
compiler, and the best thing for us is to either not support that
compiler version at all or (in this case) avoid using the compiler option
which triggers the problem. That is not what I questioned. You proposed
(as one alternative) fixing the ring code, so you should have had some
clue as to what kind of fix or workaround would be necessary. As any
problem or weakness in the ring code wasn't obvious to me, I asked you
about it.

Maybe this was just a rhetorical trick: suggest an impossible or
outrageous alternative as well, so that we more quickly accept the
simpler but perhaps more bitter pill, which as the only choice wouldn't
have been welcomed.

> [snip]
Hi, Maxim, Ola and Bill

Is it worth a further investigation in RC4 to understand this? I can do
it, since it seems to have been introduced by my patch :). Maxim, did you
mention this also happens with the old ringtest program (the hang issue)?

Best Regards, Yi

On 26 May 2016 at 16:15, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
> [snip]
On 26 May 2016 at 11:15, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> On 26 May 2016 at 10:02, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
>> On 26 May 2016 at 10:33, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
>>> On 25 May 2016 at 17:18, Maxim Uvarov <maxim.uvarov@linaro.org> wrote:
>>>> The issue is that -mcx16 and -O3 generate buggy code on gcc 4.8 and
>>>> gcc 4.9, but the code works fine with gcc 5.3, or on the old gcc
>>>> without optimization, i.e. with -O0.
>>>> So there are two options:
>>>> 1) change configure.ac to not add the -mcx16 flag;
>>>> 2) fix the ring code to support -mcx16 on old compilers.
>>>>
>>> I don't understand this one.
>>> If the (ring) code works without -mcx16 (which enables the cmpxchg16b
>>> instruction on x86), it should also work with the -mcx16 flag. I can't
>>> understand how the code on its own could be wrong. Maybe the compiler
>>> is doing something wrong when this option is specified (e.g. code
>>> generation gets screwed up).
>>>
>> Ola, I just wrote the facts which I see on my 14.10 Ubuntu with gcc 4.8
>> and gcc 4.9. And Bill has gcc 5.3 and he is unable to reproduce that
>> bug. It also works well with clang. I could spend some time installing
>> gcc 5.4, disassembling the two versions and comparing the difference,
>> but I think we should not waste time on that if everything works well
>> with a newer compiler version.
>>
> So you don't actually know that the ring code can be "fixed" (correction
> or workaround) in order to avoid this problem?
>
> I do agree that if we only see a problem with an older version of a
> specific compiler, then it is most likely caused by a bug in that
> compiler, and the best thing for us is to either not support that
> compiler version at all or (in this case) avoid using a compiler option
> which triggers the problem. This is not what I questioned. You proposed
> (as one alternative) fixing the ring code, so you should have had some
> clue as to what kind of fix or workaround would be necessary. As no
> problem or weakness in the ring code was obvious to me, I asked you
> about it.
>
> Maybe this was just a rhetorical trick: also suggest an impossible or
> outrageous alternative so that we more quickly accept the simpler but
> perhaps more bitter pill, which as the only choice wouldn't have been
> welcomed.
>
Ola, maybe I was not really clear describing my changes.

1. The original code was a little bit buggy and the test was confused. I
rewrote the test to be simply understandable, i.e. to make the threads
fight for resources. And there was a hang due to overflow of the u32,
which caused the while(1) loop to spin forever.

2. After 1, I found that the code does not work on old compilers with
-O3 -mcx16; with -O0 it works.

So they are two separate issues.

Maxim.
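The overflow Maxim describes is easy to reproduce in isolation: with
several consumer threads racing on one u32 counter, two threads can both
pass the "> 0" check and both decrement, wrapping the counter to
UINT32_MAX, so an unsigned comparison never becomes false. A minimal
standalone sketch (illustrative only, not part of the test; it assumes
the usual two's-complement conversion so that (int)UINT32_MAX == -1):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t count = 1;

		/* two consumers both saw count == 1 and both
		 * decrement: the counter wraps to UINT32_MAX */
		count--;	/* 0 */
		count--;	/* 4294967295 */

		/* the unsigned check would spin forever ... */
		printf("count > 0: %d\n", count > 0);		/* 1 */
		/* ... the cast used by the patch terminates */
		printf("(int)count > 0: %d\n", (int)count > 0);	/* 0 */
		return 0;
	}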
On 26 May 2016 at 12:05, Yi He <yi.he@linaro.org> wrote:

> Hi Maxim, Ola and Bill,
>
> Is it good to have a further investigation in RC4 to understand this? I
> can do this, since it seems to have been introduced by my patch :).
> Maxim, did you mention this also happens to the old ringtest program
> (the hang issue)?
>
No, I don't think this is necessary. We see a problem only with older GCC
versions and using an unusual compiler flag (together with -O3). The code
(both the ring implementation and the ring test code) works with newer
GCC versions and with other compilers.

There could be a problem (a race condition?) in the code (more likely in
the ring test; the actual ring implementation is copied from DPDK, and I
added the memory ordering necessary for e.g. ARM) which is only exposed
with this compiler and version, but then I would expect the problem to
resurface at some later time. What we should do is run the ringtest on
serious multi-core (many-core) machines. That would be more prone to
exposing any race conditions.
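For context on the flag itself: -mcx16 tells gcc that it may use the
x86-64 cmpxchg16b instruction, i.e. a 16-byte compare-and-swap. A minimal
sketch of the kind of operation this unlocks (an illustrative example,
assuming gcc on x86-64; behaviour without the flag varies by gcc version,
and older gcc typically fails at link time with an undefined
__sync_..._16 reference when -mcx16 is absent):

	#include <stdbool.h>
	#include <stdio.h>

	/* With -mcx16 gcc can inline this as cmpxchg16b; a 16-byte CAS
	 * is what lets lock-free code update a pointer plus an ABA
	 * counter in a single atomic step. */
	static bool cas16(__int128 *loc, __int128 expected, __int128 desired)
	{
		return __sync_bool_compare_and_swap(loc, expected, desired);
	}

	int main(void)
	{
		__int128 x = 1;

		printf("cas succeeded: %d\n", cas16(&x, 1, 2));		/* 1 */
		printf("low bits now: %llu\n", (unsigned long long)x);	/* 2 */
		return 0;
	}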
diff --git a/platform/linux-generic/test/ring/ring_stress.c b/platform/linux-generic/test/ring/ring_stress.c
index c68419f..a7e89a8 100644
--- a/platform/linux-generic/test/ring/ring_stress.c
+++ b/platform/linux-generic/test/ring/ring_stress.c
@@ -156,12 +156,11 @@ void ring_test_stress_N_M_producer_consumer(void)
 	consume_count = retrieve_consume_count();
 	CU_ASSERT(consume_count != NULL);
 
-	/* in N:M test case, producer threads are always
-	 * greater or equal to consumer threads, thus produce
-	 * enought "goods" to be consumed by consumer threads.
+	/* all producer threads try to fill ring to RING_SIZE,
+	 * while consumers threads dequeue from ring with PIECE_BULK
+	 * blocks. Multiply on 100 to add more tries.
 	 */
-	odp_atomic_init_u32(consume_count,
-			    (worker_param.numthrds) / 2);
+	odp_atomic_init_u32(consume_count, RING_SIZE / PIECE_BULK * 100);
 
 	/* kick the workers */
 	odp_cunit_thread_create(stress_worker, &worker_param);
@@ -202,8 +201,15 @@ static odp_atomic_u32_t *retrieve_consume_count(void)
 /* worker function for multiple producer instances */
 static int do_producer(_ring_t *r)
 {
-	int i, result = 0;
+	int i;
 	void **enq = NULL;
+	odp_atomic_u32_t *consume_count;
+
+	consume_count = retrieve_consume_count();
+	if (consume_count == NULL) {
+		LOG_ERR("cannot retrieve expected consume count.\n");
+		return -1;
+	}
 
 	/* allocate dummy object pointers for enqueue */
 	enq = malloc(PIECE_BULK * 2 * sizeof(void *));
@@ -216,26 +222,28 @@ static int do_producer(_ring_t *r)
 	for (i = 0; i < PIECE_BULK; i++)
 		enq[i] = (void *)(unsigned long)i;
 
-	do {
-		result = _ring_mp_enqueue_bulk(r, enq, PIECE_BULK);
-		if (0 == result) {
-			free(enq);
-			return 0;
-		}
-		usleep(10); /* wait for consumer threads */
-	} while (!_ring_full(r));
+	while ((int)odp_atomic_load_u32(consume_count) > 0) {
+		/* produce as much data as we can to the ring */
+		if (!_ring_mp_enqueue_bulk(r, enq, PIECE_BULK))
+			usleep(10);
+	}
 
+	free(enq);
 	return 0;
 }
 
 /* worker function for multiple consumer instances */
 static int do_consumer(_ring_t *r)
 {
-	int i, result = 0;
+	int i;
 	void **deq = NULL;
-	odp_atomic_u32_t *consume_count = NULL;
-	const char *message = "test OK!";
-	const char *mismatch = "data mismatch..lockless enq/deq failed.";
+	odp_atomic_u32_t *consume_count;
+
+	consume_count = retrieve_consume_count();
+	if (consume_count == NULL) {
+		LOG_ERR("cannot retrieve expected consume count.\n");
+		return -1;
+	}
 
 	/* allocate dummy object pointers for dequeue */
 	deq = malloc(PIECE_BULK * 2 * sizeof(void *));
@@ -244,31 +252,17 @@ static int do_consumer(_ring_t *r)
 		return 0; /* not failure, skip for insufficient memory */
 	}
 
-	consume_count = retrieve_consume_count();
-	if (consume_count == NULL) {
-		LOG_ERR("cannot retrieve expected consume count.\n");
-		return -1;
-	}
-
-	while (odp_atomic_load_u32(consume_count) > 0) {
-		result = _ring_mc_dequeue_bulk(r, deq, PIECE_BULK);
-		if (0 == result) {
-			/* evaluate the data pattern */
-			for (i = 0; i < PIECE_BULK; i++) {
-				if (deq[i] != (void *)(unsigned long)i) {
-					result = -1;
-					message = mismatch;
-					break;
-				}
-			}
-
-			free(deq);
-			LOG_ERR("%s\n", message);
+	while ((int)odp_atomic_load_u32(consume_count) > 0) {
+		if (!_ring_mc_dequeue_bulk(r, deq, PIECE_BULK)) {
 			odp_atomic_dec_u32(consume_count);
-			return result;
+
+			/* evaluate the data pattern */
+			for (i = 0; i < PIECE_BULK; i++)
+				CU_ASSERT(deq[i] == (void *)(unsigned long)i);
 		}
-		usleep(10); /* wait for producer threads */
 	}
+
+	free(deq);
 	return 0;
 }