diff mbox

[Linaro-mm-sig,v3,2/3] mutex: add support for wound/wait style locks, v3

Message ID 20130428170407.17075.80082.stgit@patser
State New
Headers show

Commit Message

Maarten Lankhorst April 28, 2013, 5:04 p.m. UTC
Changes since RFC patch v1:
 - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
 - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
 - removed mutex_locked_set_reservation_id (or w/e it was called)
Changes since RFC patch v2:
 - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
   mutex_(,reserve_)lock/unlock.
Changes since v1:
 - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
   triggered from normal locks, because __builtin_constant_p might evaluate to false
   for the constant 0 in that case. Tests for this have been added in the next patch.
 - Updated documentation slightly.
Changes since v2:
 - Renamed everything to ww_mutex. (mlankhorst)
 - Added ww_acquire_ctx and ww_class. (mlankhorst)
 - Added a lot of checks for wrong api usage. (mlankhorst)
 - Documentation updates. (danvet)

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
 include/linux/mutex-debug.h       |    1 
 include/linux/mutex.h             |  257 +++++++++++++++++++++
 kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
 lib/debug_locks.c                 |    2 
 5 files changed, 1010 insertions(+), 17 deletions(-)
 create mode 100644 Documentation/ww-mutex-design.txt

Comments

Daniel Vetter April 30, 2013, 7:14 p.m. UTC | #1
On Sun, Apr 28, 2013 at 07:04:07PM +0200, Maarten Lankhorst wrote:
> Changes since RFC patch v1:
>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>  - removed mutex_locked_set_reservation_id (or w/e it was called)
> Changes since RFC patch v2:
>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>    mutex_(,reserve_)lock/unlock.
> Changes since v1:
>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>    for the constant 0 in that case. Tests for this have been added in the next patch.
>  - Updated documentation slightly.
> Changes since v2:
>  - Renamed everything to ww_mutex. (mlankhorst)
>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>  - Added a lot of checks for wrong api usage. (mlankhorst)
>  - Documentation updates. (danvet)

While writing the kerneldoc I've carefully checked that all restrictions are
enforced through debug checks somehow. I think that with full mutex debug
(including lockdep) enabled, plus the slowpath injector patch I've just
posted, _all_ interface abuse will be caught at runtime as long as all
the single-threaded/uncontended cases are exercised sufficiently.

So I think we've fully achieved level 5 on the Rusty API safety scale
here. Higher levels seem pretty hard given that the concepts are rather
fancy, but I think with the new (and much more consistent) naming, plus the
explicit introduction as (more abstract) structures for ww_class and
ww_acquire_context the interface is about as intuitive as it gets.

So all together I'm pretty happy with what the interface looks like. And
one quick bikeshed below on the implementation.
-Daniel

> 
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
>  include/linux/mutex-debug.h       |    1 
>  include/linux/mutex.h             |  257 +++++++++++++++++++++
>  kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
>  lib/debug_locks.c                 |    2 
>  5 files changed, 1010 insertions(+), 17 deletions(-)
>  create mode 100644 Documentation/ww-mutex-design.txt

[snip]

> +/*
> + * after acquiring lock with fastpath or when we lost out in contested
> + * slowpath, set ctx and wake up any waiters so they can recheck.
> + *
> + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
> + * as the fastpath and opportunistic spinning are disabled in that case.
> + */
> +static __always_inline void
> +ww_mutex_set_context_fastpath(struct ww_mutex *lock,
> +			       struct ww_acquire_ctx *ctx)
> +{
> +	unsigned long flags;
> +	struct mutex_waiter *cur;
> +
> +	ww_mutex_lock_acquired(lock, ctx, false);
> +
> +	lock->ctx = ctx;
> +	smp_mb__after_atomic_dec();

I think this should be

+	smp_mb__after_atomic_dec();
+	lock->ctx = ctx;
+	smp_mb();

Also I wonder a bit how much this hurts the fastpath, and whether we
should just shovel the ctx into the atomic field with a cmpxchg, like the
rt mutex code does with the current pointer.

> +
> +	/*
> +	 * Check if lock is contended, if not there is nobody to wake up
> +	 */
> +	if (likely(atomic_read(&lock->base.count) == 0))
> +		return;
> +
> +	/*
> +	 * Uh oh, we raced in fastpath, wake up everyone in this case,
> +	 * so they can see the new ctx
> +	 */
> +	spin_lock_mutex(&lock->base.wait_lock, flags);
> +	list_for_each_entry(cur, &lock->base.wait_list, list) {
> +		debug_mutex_wake_waiter(&lock->base, cur);
> +		wake_up_process(cur->task);
> +	}
> +	spin_unlock_mutex(&lock->base.wait_lock, flags);
> +}
> +
Maarten Lankhorst May 22, 2013, 11:18 a.m. UTC | #2
Hey,

Op 30-04-13 21:14, Daniel Vetter schreef:
> On Sun, Apr 28, 2013 at 07:04:07PM +0200, Maarten Lankhorst wrote:
>> Changes since RFC patch v1:
>>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>>  - removed mutex_locked_set_reservation_id (or w/e it was called)
>> Changes since RFC patch v2:
>>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>>    mutex_(,reserve_)lock/unlock.
>> Changes since v1:
>>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>>    for the constant 0 in that case. Tests for this have been added in the next patch.
>>  - Updated documentation slightly.
>> Changes since v2:
>>  - Renamed everything to ww_mutex. (mlankhorst)
>>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>>  - Added a lot of checks for wrong api usage. (mlankhorst)
>>  - Documentation updates. (danvet)
> While writing the kerneldoc I've carefully checked that all restrictions are
> enforced through debug checks somehow. I think that with full mutex debug
> (including lockdep) enabled, plus the slowpath injector patch I've just
> posted, _all_ interface abuse will be caught at runtime as long as all
> the single-threaded/uncontended cases are exercised sufficiently.
>
> So I think we've fully achieved level 5 on the Rusty API safety scale
> here. Higher levels seem pretty hard given that the concepts are rather
> fancy, but I think with the new (and much more consistent) naming, plus the
> explicit introduction as (more abstract) structures for ww_class and
> ww_acquire_context the interface is about as intuitive as it gets.
>
> So all together I'm pretty happy with what the interface looks like. And
> one quick bikeshed below on the implementation.
> -Daniel
I included your fix below. I'm hoping to get this included in 3.11 through the drm tree, so
I can convert ttm to use it, but I haven't received any further reply on the patch series.

The 3.10 mutex improvement patches don't seem to cause any conflicts when merging
linus' tree, so I'll use drm-next as a base.

Are there any issues left? I included the patch you wrote for injecting -EDEADLK too
in my tree. The overwhelming silence makes me think there are either none, or
nobody cared enough to review it. :(

>> +/*
>> + * after acquiring lock with fastpath or when we lost out in contested
>> + * slowpath, set ctx and wake up any waiters so they can recheck.
>> + *
>> + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
>> + * as the fastpath and opportunistic spinning are disabled in that case.
>> + */
>> +static __always_inline void
>> +ww_mutex_set_context_fastpath(struct ww_mutex *lock,
>> +			       struct ww_acquire_ctx *ctx)
>> +{
>> +	unsigned long flags;
>> +	struct mutex_waiter *cur;
>> +
>> +	ww_mutex_lock_acquired(lock, ctx, false);
>> +
>> +	lock->ctx = ctx;
>> +	smp_mb__after_atomic_dec();
> I think this should be
>
> +	smp_mb__after_atomic_dec();
> +	lock->ctx = ctx;
> +	smp_mb();
>
> Also I wonder a bit how much this hurts the fastpath, and whether we
> should just shovel the ctx into the atomic field with a cmpxchg, like the
> rt mutex code does with the current pointer.
>
Fixed. I'm not sure if the second smp_mb is really needed. If there was a
smp_mb__before_atomic_read it would have been sufficient.

~Maarten
Peter Zijlstra May 22, 2013, 11:37 a.m. UTC | #3
> Are there any issues left? I included the patch you wrote for injecting -EDEADLK too
> in my tree. The overwhelming silence makes me think there are either none, or
> nobody cared enough to review it. :(

It didn't manage to reach my inbox it seems,.. I can only find a debug
patch in this thread.
Maarten Lankhorst May 22, 2013, 11:47 a.m. UTC | #4
Op 22-05-13 13:37, Peter Zijlstra schreef:
>> Are there any issues left? I included the patch you wrote for injecting -EDEADLK too
>> in my tree. The overwhelming silence makes me think there are either none, or
>> nobody cared enough to review it. :(
> It didn't manage to reach my inbox it seems,.. I can only find a debug
> patch in this thread.
>
Odd, maybe in your spam folder?
It arrived on all mailing lists, so I have no idea why you were left out.

http://www.spinics.net/lists/linux-arch/msg21425.html


~Maarten
Peter Zijlstra May 22, 2013, 12:07 p.m. UTC | #5
On Wed, May 22, 2013 at 01:47:42PM +0200, Maarten Lankhorst wrote:
> Op 22-05-13 13:37, Peter Zijlstra schreef:
> >> Are there any issues left? I included the patch you wrote for injecting -EDEADLK too
> >> in my tree. The overwhelming silence makes me think there are either none, or
> >> nobody cared enough to review it. :(
> > It didn't manage to reach my inbox it seems,.. I can only find a debug
> > patch in this thread.
> >
> Odd, maybe in your spam folder?

Couldn't spot it there either.. weird.

> It arrived on all mailing lists,

I should both clean up my one huge lkml maildir and hack notmuch into
submission so I can read LKML again :/

> so I have no idea why you were left out.
> 
> http://www.spinics.net/lists/linux-arch/msg21425.html

Thanks, I'll go stare at it.
Peter Zijlstra May 22, 2013, 4:18 p.m. UTC | #6
On Wed, May 22, 2013 at 01:18:14PM +0200, Maarten Lankhorst wrote:

Lacking the actual msg atm, I'm going to paste in here...
  	
> Subject: [PATCH v3 2/3] mutex: add support for wound/wait style locks, v3
> From: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
> 
> Changes since RFC patch v1:
>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>  - removed mutex_locked_set_reservation_id (or w/e it was called)
> Changes since RFC patch v2:
>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>    mutex_(,reserve_)lock/unlock.
> Changes since v1:
>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>    for the constant 0 in that case. Tests for this have been added in the next patch.
>  - Updated documentation slightly.
> Changes since v2:
>  - Renamed everything to ww_mutex. (mlankhorst)
>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>  - Added a lot of checks for wrong api usage. (mlankhorst)
>  - Documentation updates. (danvet)
> 
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
> Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxx>
> ---
>  Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
>  include/linux/mutex-debug.h       |    1 
>  include/linux/mutex.h             |  257 +++++++++++++++++++++
>  kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
>  lib/debug_locks.c                 |    2 
>  5 files changed, 1010 insertions(+), 17 deletions(-)
>  create mode 100644 Documentation/ww-mutex-design.txt
> 
> diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
> new file mode 100644
> index 0000000..154bae3
> --- /dev/null
> +++ b/Documentation/ww-mutex-design.txt
> @@ -0,0 +1,322 @@
> +Wait/Wound Deadlock-Proof Mutex Design
> +======================================
> +
> +Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
> +
> +Motivation for WW-Mutexes
> +-------------------------
> +
> +GPU's do operations that commonly involve many buffers.  Those buffers
> +can be shared across contexts/processes, exist in different memory
> +domains (for example VRAM vs system memory), and so on.  And with
> +PRIME / dmabuf, they can even be shared across devices.  So there are
> +a handful of situations where the driver needs to wait for buffers to
> +become ready.  If you think about this in terms of waiting on a buffer
> +mutex for it to become available, this presents a problem because
> +there is no way to guarantee that buffers appear in a execbuf/batch in
> +the same order in all contexts.  That is directly under control of
> +userspace, and a result of the sequence of GL calls that an application
> +makes.	Which results in the potential for deadlock.  The problem gets
> +more complex when you consider that the kernel may need to migrate the
> +buffer(s) into VRAM before the GPU operates on the buffer(s), which
> +may in turn require evicting some other buffers (and you don't want to
> +evict other buffers which are already queued up to the GPU), but for a
> +simplified understanding of the problem you can ignore this.
> +
> +The algorithm that TTM came up with for dealing with this problem is quite
> +simple.  For each group of buffers (execbuf) that need to be locked, the caller
> +would be assigned a unique reservation id/ticket, from a global counter.  In
> +case of deadlock while locking all the buffers associated with a execbuf, the
> +one with the lowest reservation ticket (i.e. the oldest task) wins, and the one
> +with the higher reservation id (i.e. the younger task) unlocks all of the
> +buffers that it has already locked, and then tries again.
> +
> +In the RDBMS literature this deadlock handling approach is called wait/wound:
> +The older task waits until it can acquire the contended lock. The younger task
> +needs to back off and drop all the locks it is currently holding, i.e. the
> +younger task is wounded.
> +
> +Concepts
> +--------
> +
> +Compared to normal mutexes two additional concepts/objects show up in the lock
> +interface for w/w mutexes:
> +
> +Acquire context: To ensure eventual forward progress it is important that a task
> +trying to acquire locks doesn't grab a new reservation id, but keeps the one it
> +acquired when starting the lock acquisition. This ticket is stored in the
> +acquire context. Furthermore the acquire context keeps track of debugging state
> +to catch w/w mutex interface abuse.
> +
> +W/w class: In contrast to normal mutexes the lock class needs to be explicit for
> +w/w mutexes, since it is required to initialize the acquire context.
> +
> +Furthermore there are three different classes of w/w lock acquire functions:
> +- Normal lock acquisition with a context, using ww_mutex_lock
> +- Slowpath lock acquisition on the contending lock, used by the wounded task
> +  after having dropped all already acquired locks. These functions have the
> +  _slow postfix.

See below, I don't see the need for this interface.

> +- Functions to only acquire a single w/w mutex, which results in the exact same
> +  semantics as a normal mutex. These functions have the _single postfix.

This is missing rationale.

> +
> +Of course, all the usual variants for handling wake-ups due to signals are also
> +provided.
> +
> +Usage
> +-----
> +
> +Three different ways to acquire locks within the same w/w class. Common
> +definitions for methods 1&2.
> +
> +static DEFINE_WW_CLASS(ww_class);
> +
> +struct obj {
> +	sct ww_mutex lock;
> +	/* obj data */
> +};
> +
> +struct obj_entry {
> +	struct list_head *list;
> +	struct obj *obj;
> +};
> +
> +Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
> +This is useful if a list of required objects is already tracked somewhere.
> +Furthermore the lock helper can propagate the -EALREADY return code back to
> +the caller as a signal that an object is twice on the list. This is useful if
> +the list is constructed from userspace input and the ABI requires userspace to
> +not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
> +
> +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
> +{
> +	struct obj *res_obj = NULL;
> +	struct obj_entry *contended_entry = NULL;
> +	struct obj_entry *entry;
> +
> +	ww_acquire_init(ctx, &ww_class);
> +
> +retry:
> +	list_for_each_entry (list, entry) {
> +		if (entry == res_obj) {
> +			res_obj = NULL;
> +			continue;
> +		}
> +		ret = ww_mutex_lock(&entry->obj->lock, ctx);
> +		if (ret < 0) {
> +			contended_obj = entry;
> +			goto err;
> +		}
> +	}
> +
> +	ww_acquire_done(ctx);
> +	return 0;
> +
> +err:
> +	list_for_each_entry_continue_reverse (list, contended_entry, entry)
> +		ww_mutex_unlock(&entry->obj->lock);
> +
> +	if (res_obj)
> +		ww_mutex_unlock(&res_obj->lock);
> +
> +	if (ret == -EDEADLK) {
> +		/* we lost out in a seqno race, lock and retry.. */
> +		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);

I'm missing the need for ww_mutex_lock_slow(). AFAICT we should be able to tell
it's the first lock in the ctx and thus we cannot possibly deadlock.

> +		res_obj = contended_entry->obj;
> +		goto retry;
> +	}
> +	ww_acquire_fini(ctx);
> +
> +	return ret;
> +}
> +

... you certainly went all out on documentation.

> diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
> index 731d77d..4ac8b19 100644
> --- a/include/linux/mutex-debug.h
> +++ b/include/linux/mutex-debug.h
> @@ -3,6 +3,7 @@
>  
>  #include <linux/linkage.h>
>  #include <linux/lockdep.h>
> +#include <linux/debug_locks.h>
>  
>  /*
>   * Mutexes - debugging helpers:
> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
> index 9121595..004f863 100644
> --- a/include/linux/mutex.h
> +++ b/include/linux/mutex.h
> @@ -74,6 +74,35 @@ struct mutex_waiter {
>  #endif
>  };
>  
> +struct ww_class {
> +	atomic_long_t stamp;
> +	struct lock_class_key acquire_key;
> +	struct lock_class_key mutex_key;
> +	const char *acquire_name;
> +	const char *mutex_name;
> +};
> +
> +struct ww_acquire_ctx {
> +	struct task_struct *task;
> +	unsigned long stamp;
> +#ifdef CONFIG_DEBUG_MUTEXES
> +	unsigned acquired, done_acquire;
> +	struct ww_class *ww_class;
> +	struct ww_mutex *contending_lock;
> +#endif
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	struct lockdep_map dep_map;
> +#endif
> +};
> +
> +struct ww_mutex {
> +	struct mutex base;
> +	struct ww_acquire_ctx *ctx;
> +#ifdef CONFIG_DEBUG_MUTEXES
> +	struct ww_class *ww_class;
> +#endif
> +};
> +


> @@ -167,6 +236,192 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
>   */
>  extern int mutex_trylock(struct mutex *lock);
>  extern void mutex_unlock(struct mutex *lock);
> +
> +/**
> + * ww_acquire_init - initialize a w/w acquire context
> + * @ctx: w/w acquire context to initialize
> + * @ww_class: w/w class of the context
> + *
> + * Initializes a context to acquire multiple mutexes of the given w/w class.
> + *
> + * Context-based w/w mutex acquiring can be done in any order whatsoever within
> + * a given lock class. Deadlocks will be detected and handled with the
> + * wait/wound logic.
> + *
> + * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
> + * result in undetected deadlocks and is so forbidden. Mixing different contexts
> + * for the same w/w class when acquiring mutexes can also result in undetected
> + * deadlocks, and is hence also forbidden.
> + *
> + * Nesting of acquire contexts for _different_ w/w classes is possible, subject
> + * to the usual locking rules between different lock classes.
> + *
> + * An acquire context must be released with ww_acquire_fini by the same task
> + * before the memory is freed. It is recommended to allocate the context itself
> + * on the stack.
> + */
> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
> +				   struct ww_class *ww_class)
> +{
> +	ctx->task = current;
> +	do {
> +		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
> +	} while (unlikely(!ctx->stamp));

I suppose we'll figure something out when this becomes a bottleneck. Ideally
we'd do something like:

 ctx->stamp = local_clock();

but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
too coarse to work for this.

Also, why is 0 special?

> +#ifdef CONFIG_DEBUG_MUTEXES
> +	ctx->ww_class = ww_class;
> +	ctx->acquired = ctx->done_acquire = 0;
> +	ctx->contending_lock = NULL;
> +#endif
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
> +	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
> +			 &ww_class->acquire_key, 0);
> +	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
> +#endif
> +}

> +/**
> + * ww_mutex_trylock_single - tries to acquire the w/w mutex without acquire context
> + * @lock: mutex to lock
> + *
> + * Trylocks a mutex without acquire context, so no deadlock detection is
> + * possible. Returns 0 if the mutex has been acquired.
> + *
> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
> + */
> +static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
> +{
> +	return mutex_trylock(&lock->base);
> +}

trylocks can never deadlock, since they don't block by definition; I don't see
the point of the _single() thing here.

> +/**
> + * ww_mutex_lock_single - acquire the w/w mutex without acquire context
> + * @lock: mutex to lock
> + *
> + * Locks a mutex without acquire context, so no deadlock detection is
> + * possible.
> + *
> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
> + */
> +static inline void ww_mutex_lock_single(struct ww_mutex *lock)
> +{
> +	mutex_lock(&lock->base);
> +}

as per the above, I'm missing the rationale for having this.

> diff --git a/kernel/mutex.c b/kernel/mutex.c
> index 84a5f07..66807c7 100644
> --- a/kernel/mutex.c
> +++ b/kernel/mutex.c
> @@ -127,16 +127,156 @@ void __sched mutex_unlock(struct mutex *lock)
>  
>  EXPORT_SYMBOL(mutex_unlock);
>  
> +/**
> + * ww_mutex_unlock - release the w/w mutex
> + * @lock: the mutex to be released
> + *
> + * Unlock a mutex that has been locked by this task previously
> + * with ww_mutex_lock* using an acquire context. It is forbidden to release the
> + * locks after releasing the acquire context.
> + *
> + * This function must not be used in interrupt context. Unlocking
> + * of an unlocked mutex is not allowed.
> + *
> + * Note that locks acquired with one of the ww_mutex_lock*single variants must be
> + * unlocked with ww_mutex_unlock_single.
> + */
> +void __sched ww_mutex_unlock(struct ww_mutex *lock)
> +{
> +	/*
> +	 * The unlocking fastpath is the 0->1 transition from 'locked'
> +	 * into 'unlocked' state:
> +	 */
> +#ifdef CONFIG_DEBUG_MUTEXES
> +	DEBUG_LOCKS_WARN_ON(!lock->ctx);
> +	if (lock->ctx) {
> +		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
> +		if (lock->ctx->acquired > 0)
> +			lock->ctx->acquired--;
> +	}
> +#endif
> +	lock->ctx = NULL;

barriers should always have a comment explaining the exact ordering and the
pairing barrier's location.

> +	smp_mb__before_atomic_inc();
> +
> +#ifndef CONFIG_DEBUG_MUTEXES
> +	/*
> +	 * When debugging is enabled we must not clear the owner before time,
> +	 * the slow path will always be taken, and that clears the owner field
> +	 * after verifying that it was indeed current.
> +	 */
> +	mutex_clear_owner(&lock->base);
> +#endif
> +	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
> +}
> +EXPORT_SYMBOL(ww_mutex_unlock);
> +
> +static inline int __sched
> +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
> +{
> +	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
> +	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
> +
> +	if (!hold_ctx)
> +		return 0;
> +
> +	if (unlikely(ctx->stamp == hold_ctx->stamp))
> +		return -EALREADY;

Why compare stamps? I expected: ctx == hold_ctx here.

> +
> +	if (unlikely(ctx->stamp - hold_ctx->stamp <= LONG_MAX)) {

Why not simply write: ctx->stamp > hold_ctx->stamp ?

If we need to deal with equal stamps from different contexts we could tie-break
based on ctx address or so, but seeing it's a global counter from the class,
that shouldn't happen for now.

> +#ifdef CONFIG_DEBUG_MUTEXES
> +		DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
> +		ctx->contending_lock = ww;
> +#endif
> +		return -EDEADLK;
> +	}
> +
> +	return 0;
> +}
> +
> +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
> +						   struct ww_acquire_ctx *ww_ctx,
> +						   bool ww_slow)
> +{
> +#ifdef CONFIG_DEBUG_MUTEXES
> +	/*
> +	 * If this WARN_ON triggers, you used mutex_lock to acquire,
> +	 * but released with ww_mutex_unlock in this call.
> +	 */
> +	DEBUG_LOCKS_WARN_ON(ww->ctx);
> +
> +	/*
> +	 * Not quite done after ww_acquire_done() ?
> +	 */
> +	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
> +
> +	if (ww_slow) {

s/ww_slow/!ww_ctx->acquired/

> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
> +		ww_ctx->contending_lock = NULL;
> +	} else
> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
> +
> +
> +	/*
> +	 * Naughty, using a different class can lead to undefined behavior!
> +	 */
> +	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
> +
> +	if (ww_slow)
> +		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
> +
> +	ww_ctx->acquired++;
> +#endif
> +}
> +
> +/*
> + * after acquiring lock with fastpath or when we lost out in contested
> + * slowpath, set ctx and wake up any waiters so they can recheck.
> + *
> + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
> + * as the fastpath and opportunistic spinning are disabled in that case.
> + */
> +static __always_inline void
> +ww_mutex_set_context_fastpath(struct ww_mutex *lock,
> +			       struct ww_acquire_ctx *ctx)
> +{
> +	unsigned long flags;
> +	struct mutex_waiter *cur;
> +
> +	ww_mutex_lock_acquired(lock, ctx, false);
> +
> +	lock->ctx = ctx;

 missing comment

> +	smp_mb__after_atomic_dec();
> +
> +	/*
> +	 * Check if lock is contended, if not there is nobody to wake up
> +	 */
> +	if (likely(atomic_read(&lock->base.count) == 0))
> +		return;
> +
> +	/*
> +	 * Uh oh, we raced in fastpath, wake up everyone in this case,
> +	 * so they can see the new ctx
> +	 */
> +	spin_lock_mutex(&lock->base.wait_lock, flags);
> +	list_for_each_entry(cur, &lock->base.wait_list, list) {
> +		debug_mutex_wake_waiter(&lock->base, cur);
> +		wake_up_process(cur->task);
> +	}
> +	spin_unlock_mutex(&lock->base.wait_lock, flags);
> +}
> +
>  /*
>   * Lock a mutex (possibly interruptible), slowpath:
>   */
> -static inline int __sched
> +static __always_inline int __sched
>  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
> -		    struct lockdep_map *nest_lock, unsigned long ip)
> +		    struct lockdep_map *nest_lock, unsigned long ip,
> +		    struct ww_acquire_ctx *ww_ctx, bool ww_slow)
>  {
>  	struct task_struct *task = current;
>  	struct mutex_waiter waiter;
>  	unsigned long flags;
> +	int ret;
>  
>  	preempt_disable();
>  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
> @@ -163,6 +303,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>  	for (;;) {
>  		struct task_struct *owner;
>  
> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {

Since we _know_ ww_ctx isn't NULL, we can trivially do: s/ww_slow/!ww_ctx->acquired/

> +			struct ww_mutex *ww;
> +
> +			ww = container_of(lock, struct ww_mutex, base);
> +			if (ACCESS_ONCE(ww->ctx))

What's the point of this ACCESS_ONCE()?

> +				break;
> +		}
> +
>  		/*
>  		 * If there's an owner, wait for it to either
>  		 * release the lock or go to sleep.
> @@ -173,6 +321,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>  
>  		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
>  			lock_acquired(&lock->dep_map, ip);

Should this not also have a __builtin_constant_p(ww_ctx == NULL) ?

> +			if (ww_slow) {
> +				struct ww_mutex *ww;
> +				ww = container_of(lock, struct ww_mutex, base);
> +
> +				ww_mutex_set_context_fastpath(ww, ww_ctx);
> +			}
> +
>  			mutex_set_owner(lock);
>  			preempt_enable();
>  			return 0;
> @@ -228,15 +383,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>  		 * TASK_UNINTERRUPTIBLE case.)
>  		 */
>  		if (unlikely(signal_pending_state(state, task))) {
> -			mutex_remove_waiter(lock, &waiter,
> -					    task_thread_info(task));
> -			mutex_release(&lock->dep_map, 1, ip);
> -			spin_unlock_mutex(&lock->wait_lock, flags);
> +			ret = -EINTR;
> +			goto err;
> +		}
>  
> -			debug_mutex_free_waiter(&waiter);
> -			preempt_enable();
> -			return -EINTR;
> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
> +			ret = __mutex_lock_check_stamp(lock, ww_ctx);
> +			if (ret)
> +				goto err;
>  		}
> +
>  		__set_task_state(task, state);
>  
>  		/* didn't get the lock, go to sleep: */
> @@ -251,6 +407,30 @@ done:
>  	mutex_remove_waiter(lock, &waiter, current_thread_info());
>  	mutex_set_owner(lock);
>  
> +	if (!__builtin_constant_p(ww_ctx == NULL)) {
> +		struct ww_mutex *ww = container_of(lock,
> +						      struct ww_mutex,
> +						      base);
> +		struct mutex_waiter *cur;
> +
> +		/*
> +		 * This branch gets optimized out for the common case,
> +		 * and is only important for ww_mutex_lock.
> +		 */
> +
> +		ww_mutex_lock_acquired(ww, ww_ctx, ww_slow);
> +		ww->ctx = ww_ctx;
> +
> +		/*
> +		 * Give any possible sleeping processes the chance to wake up,
> +		 * so they can recheck if they have to back off.
> +		 */
> +		list_for_each_entry(cur, &lock->wait_list, list) {
> +			debug_mutex_wake_waiter(lock, cur);
> +			wake_up_process(cur->task);
> +		}
> +	}
> +
>  	/* set it to 0 if there are no waiters left: */
>  	if (likely(list_empty(&lock->wait_list)))
>  		atomic_set(&lock->count, 0);
> @@ -261,6 +441,14 @@ done:
>  	preempt_enable();
>  
>  	return 0;
> +
> +err:
> +	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
> +	spin_unlock_mutex(&lock->wait_lock, flags);
> +	debug_mutex_free_waiter(&waiter);
> +	mutex_release(&lock->dep_map, 1, ip);
> +	preempt_enable();
> +	return ret;
>  }
>  
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> @@ -268,7 +456,8 @@ void __sched
>  mutex_lock_nested(struct mutex *lock, unsigned int subclass)
>  {
>  	might_sleep();
> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
> +			    subclass, NULL, _RET_IP_, 0, 0);
>  }

The pedant in me has to tell you 4x that NULL != 0 :-)

> +EXPORT_SYMBOL_GPL(ww_mutex_lock);
> +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow);
> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow_interruptible);

Not having to do the _slow stuff saves lines and interface complexity!

> @@ -401,20 +738,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
>  {
>  	struct mutex *lock = container_of(lock_count, struct mutex, count);
>  
> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
> +			    NULL, _RET_IP_, 0, 0);
>  }
>  
>  static noinline int __sched
>  __mutex_lock_killable_slowpath(struct mutex *lock)
>  {
> -	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
> +	return __mutex_lock_common(lock, TASK_KILLABLE, 0,
> +				   NULL, _RET_IP_, 0, 0);
>  }
>  
>  static noinline int __sched
>  __mutex_lock_interruptible_slowpath(struct mutex *lock)
>  {
> -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
> +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
> +				   NULL, _RET_IP_, 0, 0);
>  }

A few more cases where NULL != 0 :-)
Daniel Vetter May 22, 2013, 4:49 p.m. UTC | #7
On Wed, May 22, 2013 at 6:18 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, May 22, 2013 at 01:18:14PM +0200, Maarten Lankhorst wrote:
>
> Lacking the actual msg atm, I'm going to paste in here...

Just replying to the interface/doc comments, Maarten's the guy for the
gory details ;-)

>> Subject: [PATCH v3 2/3] mutex: add support for wound/wait style locks, v3
>> From: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>>
>> Changes since RFC patch v1:
>>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>>  - removed mutex_locked_set_reservation_id (or w/e it was called)
>> Changes since RFC patch v2:
>>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>>    mutex_(,reserve_)lock/unlock.
>> Changes since v1:
>>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>>    for the constant 0 in that case. Tests for this have been added in the next patch.
>>  - Updated documentation slightly.
>> Changes since v2:
>>  - Renamed everything to ww_mutex. (mlankhorst)
>>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>>  - Added a lot of checks for wrong api usage. (mlankhorst)
>>  - Documentation updates. (danvet)
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>> Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxx>
>> ---
>>  Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
>>  include/linux/mutex-debug.h       |    1
>>  include/linux/mutex.h             |  257 +++++++++++++++++++++
>>  kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
>>  lib/debug_locks.c                 |    2
>>  5 files changed, 1010 insertions(+), 17 deletions(-)
>>  create mode 100644 Documentation/ww-mutex-design.txt
>>
>> diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
>> new file mode 100644
>> index 0000000..154bae3
>> --- /dev/null
>> +++ b/Documentation/ww-mutex-design.txt
>> @@ -0,0 +1,322 @@
>> +Wait/Wound Deadlock-Proof Mutex Design
>> +======================================
>> +
>> +Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
>> +
>> +Motivation for WW-Mutexes
>> +-------------------------
>> +
>> +GPU's do operations that commonly involve many buffers.  Those buffers
>> +can be shared across contexts/processes, exist in different memory
>> +domains (for example VRAM vs system memory), and so on.  And with
>> +PRIME / dmabuf, they can even be shared across devices.  So there are
>> +a handful of situations where the driver needs to wait for buffers to
>> +become ready.  If you think about this in terms of waiting on a buffer
>> +mutex for it to become available, this presents a problem because
>> +there is no way to guarantee that buffers appear in a execbuf/batch in
>> +the same order in all contexts.  That is directly under control of
>> +userspace, and a result of the sequence of GL calls that an application
>> +makes.       Which results in the potential for deadlock.  The problem gets
>> +more complex when you consider that the kernel may need to migrate the
>> +buffer(s) into VRAM before the GPU operates on the buffer(s), which
>> +may in turn require evicting some other buffers (and you don't want to
>> +evict other buffers which are already queued up to the GPU), but for a
>> +simplified understanding of the problem you can ignore this.
>> +
>> +The algorithm that TTM came up with for dealing with this problem is quite
>> +simple.  For each group of buffers (execbuf) that need to be locked, the caller
>> +would be assigned a unique reservation id/ticket, from a global counter.  In
>> +case of deadlock while locking all the buffers associated with a execbuf, the
>> +one with the lowest reservation ticket (i.e. the oldest task) wins, and the one
>> +with the higher reservation id (i.e. the younger task) unlocks all of the
>> +buffers that it has already locked, and then tries again.
>> +
>> +In the RDBMS literature this deadlock handling approach is called wait/wound:
>> +The older tasks waits until it can acquire the contended lock. The younger tasks
>> +needs to back off and drop all the locks it is currently holding, i.e. the
>> +younger task is wounded.
>> +
>> +Concepts
>> +--------
>> +
>> +Compared to normal mutexes two additional concepts/objects show up in the lock
>> +interface for w/w mutexes:
>> +
>> +Acquire context: To ensure eventual forward progress it is important the a task
>> +trying to acquire locks doesn't grab a new reservation id, but keeps the one it
>> +acquired when starting the lock acquisition. This ticket is stored in the
>> +acquire context. Furthermore the acquire context keeps track of debugging state
>> +to catch w/w mutex interface abuse.
>> +
>> +W/w class: In contrast to normal mutexes the lock class needs to be explicit for
>> +w/w mutexes, since it is required to initialize the acquire context.
>> +
>> +Furthermore there are three different classe of w/w lock acquire functions:
>> +- Normal lock acquisition with a context, using ww_mutex_lock
>> +- Slowpath lock acquisition on the contending lock, used by the wounded task
>> +  after having dropped all already acquired locks. These functions have the
>> +  _slow postfix.
>
> See below, I don't see the need for this interface.

I think it helps the code clarity to have special slowpath locking
functions. But it also helps with interface safety:
- __must_check int vs. void return values sign up gcc to help check
for correct usage. Of course the first locking operation can't really
fail, but since those usually happen in a loop this shouldn't ever
hurt. At least the examples all need to check the return value to be
correct.
- _slow functions can check whether all acquired locks have been
released and whether the caller is indeed blocking on the contending
lock. Not doing so could either result in needless spinning instead of
blocking (when blocking on the wrong lock) or in deadlocks (when not
dropping all acquired locks).

Together with the debug patch to forcefully go through the slowpath
this should catch all interface abuse. Dropping the _slow functions
would rid us of this nice safety net.

>> +- Functions to only acquire a single w/w mutex, which results in the exact same
>> +  semantics as a normal mutex. These functions have the _single postfix.
>
> This is missing rationale.

Again it signs up gcc with the __must_check int vs. void return
values. In addition you don't need to set up a ww_acquire_ctx for the
single version. Since gem/ttm has quite a few interfaces to check the
status of buffer objects we have a lot of places that only acquire the
lock for a single object. So I think the added interface scope is
worth the upside of simpler code in ww mutex users.

>> +
>> +Of course, all the usual variants for handling wake-ups due to signals are also
>> +provided.
>> +
>> +Usage
>> +-----
>> +
>> +Three different ways to acquire locks within the same w/w class. Common
>> +definitions for methods 1&2.
>> +
>> +static DEFINE_WW_CLASS(ww_class);
>> +
>> +struct obj {
>> +     sct ww_mutex lock;
>> +     /* obj data */
>> +};
>> +
>> +struct obj_entry {
>> +     struct list_head *list;
>> +     struct obj *obj;
>> +};
>> +
>> +Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
>> +This is useful if a list of required objects is already tracked somewhere.
>> +Furthermore the lock helper can use propagate the -EALREADY return code back to
>> +the caller as a signal that an object is twice on the list. This is useful if
>> +the list is constructed from userspace input and the ABI requires userspace to
>> +no have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
>> +
>> +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
>> +{
>> +     struct obj *res_obj = NULL;
>> +     struct obj_entry *contended_entry = NULL;
>> +     struct obj_entry *entry;
>> +
>> +     ww_acquire_init(ctx, &ww_class);
>> +
>> +retry:
>> +     list_for_each_entry (list, entry) {
>> +             if (entry == res_obj) {
>> +                     res_obj = NULL;
>> +                     continue;
>> +             }
>> +             ret = ww_mutex_lock(&entry->obj->lock, ctx);
>> +             if (ret < 0) {
>> +                     contended_obj = entry;
>> +                     goto err;
>> +             }
>> +     }
>> +
>> +     ww_acquire_done(ctx);
>> +     return 0;
>> +
>> +err:
>> +     list_for_each_entry_continue_reverse (list, contended_entry, entry)
>> +             ww_mutex_unlock(&entry->obj->lock);
>> +
>> +     if (res_obj)
>> +             ww_mutex_unlock(&res_obj->lock);
>> +
>> +     if (ret == -EDEADLK) {
>> +             /* we lost out in a seqno race, lock and retry.. */
>> +             ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
>
> I missing the need for ww_mutex_lock_slow(). AFAICT we should be able to tell
> its the first lock in the ctx and thus we cannot possibly deadlock.

See above, it's only for better debugging checks and (imho) clearer
code in the ww mutex users.

>> +             res_obj = contended_entry->obj;
>> +             goto retry;
>> +     }
>> +     ww_acquire_fini(ctx);
>> +
>> +     return ret;
>> +}
>> +
>
> ... you certainly went all out on documentation.

Thanks ;-) And I think it was a good exercise in clarifying the details
of the interfaces and especially checking whether full debugging will
catch all the MUST and MUST NOT cases. Like I've mentioned in the docs
somewhere I'm fairly convinced that the only untested ww mutex user
bug is not taking all required locks, everything else should be fully
covered.

Cheers, Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
Maarten Lankhorst May 22, 2013, 5:24 p.m. UTC | #8
Hey,

Op 22-05-13 18:18, Peter Zijlstra schreef:
> On Wed, May 22, 2013 at 01:18:14PM +0200, Maarten Lankhorst wrote:
>
> Lacking the actual msg atm, I'm going to paste in here...
Thanks for taking the time to review.
>> Subject: [PATCH v3 2/3] mutex: add support for wound/wait style locks, v3
>> From: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>>
>> Changes since RFC patch v1:
>>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>>  - removed mutex_locked_set_reservation_id (or w/e it was called)
>> Changes since RFC patch v2:
>>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>>    mutex_(,reserve_)lock/unlock.
>> Changes since v1:
>>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>>    for the constant 0 in that case. Tests for this have been added in the next patch.
>>  - Updated documentation slightly.
>> Changes since v2:
>>  - Renamed everything to ww_mutex. (mlankhorst)
>>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>>  - Added a lot of checks for wrong api usage. (mlankhorst)
>>  - Documentation updates. (danvet)
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>> Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxx>
>> ---
>>  Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
>>  include/linux/mutex-debug.h       |    1 
>>  include/linux/mutex.h             |  257 +++++++++++++++++++++
>>  kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
>>  lib/debug_locks.c                 |    2 
>>  5 files changed, 1010 insertions(+), 17 deletions(-)
>>  create mode 100644 Documentation/ww-mutex-design.txt
>>
>> diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
>> new file mode 100644
>> index 0000000..154bae3
>> --- /dev/null
>> +++ b/Documentation/ww-mutex-design.txt
>> @@ -0,0 +1,322 @@
>> +Wait/Wound Deadlock-Proof Mutex Design
>> +======================================
>> +
>> +Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
>> +
>> +Motivation for WW-Mutexes
>> +-------------------------
>> +
>> +GPU's do operations that commonly involve many buffers.  Those buffers
>> +can be shared across contexts/processes, exist in different memory
>> +domains (for example VRAM vs system memory), and so on.  And with
>> +PRIME / dmabuf, they can even be shared across devices.  So there are
>> +a handful of situations where the driver needs to wait for buffers to
>> +become ready.  If you think about this in terms of waiting on a buffer
>> +mutex for it to become available, this presents a problem because
>> +there is no way to guarantee that buffers appear in a execbuf/batch in
>> +the same order in all contexts.  That is directly under control of
>> +userspace, and a result of the sequence of GL calls that an application
>> +makes.	Which results in the potential for deadlock.  The problem gets
>> +more complex when you consider that the kernel may need to migrate the
>> +buffer(s) into VRAM before the GPU operates on the buffer(s), which
>> +may in turn require evicting some other buffers (and you don't want to
>> +evict other buffers which are already queued up to the GPU), but for a
>> +simplified understanding of the problem you can ignore this.
>> +
>> +The algorithm that TTM came up with for dealing with this problem is quite
>> +simple.  For each group of buffers (execbuf) that need to be locked, the caller
>> +would be assigned a unique reservation id/ticket, from a global counter.  In
>> +case of deadlock while locking all the buffers associated with a execbuf, the
>> +one with the lowest reservation ticket (i.e. the oldest task) wins, and the one
>> +with the higher reservation id (i.e. the younger task) unlocks all of the
>> +buffers that it has already locked, and then tries again.
>> +
>> +In the RDBMS literature this deadlock handling approach is called wait/wound:
>> +The older tasks waits until it can acquire the contended lock. The younger tasks
>> +needs to back off and drop all the locks it is currently holding, i.e. the
>> +younger task is wounded.
>> +
>> +Concepts
>> +--------
>> +
>> +Compared to normal mutexes two additional concepts/objects show up in the lock
>> +interface for w/w mutexes:
>> +
>> +Acquire context: To ensure eventual forward progress it is important the a task
>> +trying to acquire locks doesn't grab a new reservation id, but keeps the one it
>> +acquired when starting the lock acquisition. This ticket is stored in the
>> +acquire context. Furthermore the acquire context keeps track of debugging state
>> +to catch w/w mutex interface abuse.
>> +
>> +W/w class: In contrast to normal mutexes the lock class needs to be explicit for
>> +w/w mutexes, since it is required to initialize the acquire context.
>> +
>> +Furthermore there are three different classe of w/w lock acquire functions:
>> +- Normal lock acquisition with a context, using ww_mutex_lock
>> +- Slowpath lock acquisition on the contending lock, used by the wounded task
>> +  after having dropped all already acquired locks. These functions have the
>> +  _slow postfix.
> See below, I don't see the need for this interface.
>
>> +- Functions to only acquire a single w/w mutex, which results in the exact same
>> +  semantics as a normal mutex. These functions have the _single postfix.
> This is missing rationale.
trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.

>> +
>> +Of course, all the usual variants for handling wake-ups due to signals are also
>> +provided.
>> +
>> +Usage
>> +-----
>> +
>> +Three different ways to acquire locks within the same w/w class. Common
>> +definitions for methods 1&2.
>> +
>> +static DEFINE_WW_CLASS(ww_class);
>> +
>> +struct obj {
>> +	sct ww_mutex lock;
>> +	/* obj data */
>> +};
>> +
>> +struct obj_entry {
>> +	struct list_head *list;
>> +	struct obj *obj;
>> +};
>> +
>> +Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
>> +This is useful if a list of required objects is already tracked somewhere.
>> +Furthermore the lock helper can use propagate the -EALREADY return code back to
>> +the caller as a signal that an object is twice on the list. This is useful if
>> +the list is constructed from userspace input and the ABI requires userspace to
>> +no have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
>> +
>> +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
>> +{
>> +	struct obj *res_obj = NULL;
>> +	struct obj_entry *contended_entry = NULL;
>> +	struct obj_entry *entry;
>> +
>> +	ww_acquire_init(ctx, &ww_class);
>> +
>> +retry:
>> +	list_for_each_entry (list, entry) {
>> +		if (entry == res_obj) {
>> +			res_obj = NULL;
>> +			continue;
>> +		}
>> +		ret = ww_mutex_lock(&entry->obj->lock, ctx);
>> +		if (ret < 0) {
>> +			contended_obj = entry;
>> +			goto err;
>> +		}
>> +	}
>> +
>> +	ww_acquire_done(ctx);
>> +	return 0;
>> +
>> +err:
>> +	list_for_each_entry_continue_reverse (list, contended_entry, entry)
>> +		ww_mutex_unlock(&entry->obj->lock);
>> +
>> +	if (res_obj)
>> +		ww_mutex_unlock(&res_obj->lock);
>> +
>> +	if (ret == -EDEADLK) {
>> +		/* we lost out in a seqno race, lock and retry.. */
>> +		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
> I missing the need for ww_mutex_lock_slow(). AFAICT we should be able to tell
> its the first lock in the ctx and thus we cannot possibly deadlock.
Theoretically true, but that would require always setting ctx->acquired correctly.
Plus that would weaken the checks. Without ww_mutex_lock_slow you cannot
say for sure that all mutexes have been unlocked, and so cannot verify that what you say is really true.

>> +		res_obj = contended_entry->obj;
>> +		goto retry;
>> +	}
>> +	ww_acquire_fini(ctx);
>> +
>> +	return ret;
>> +}
>> +
> ... you certainly went all out on documentation.
>
>> diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
>> index 731d77d..4ac8b19 100644
>> --- a/include/linux/mutex-debug.h
>> +++ b/include/linux/mutex-debug.h
>> @@ -3,6 +3,7 @@
>>  
>>  #include <linux/linkage.h>
>>  #include <linux/lockdep.h>
>> +#include <linux/debug_locks.h>
>>  
>>  /*
>>   * Mutexes - debugging helpers:
>> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
>> index 9121595..004f863 100644
>> --- a/include/linux/mutex.h
>> +++ b/include/linux/mutex.h
>> @@ -74,6 +74,35 @@ struct mutex_waiter {
>>  #endif
>>  };
>>  
>> +struct ww_class {
>> +	atomic_long_t stamp;
>> +	struct lock_class_key acquire_key;
>> +	struct lock_class_key mutex_key;
>> +	const char *acquire_name;
>> +	const char *mutex_name;
>> +};
>> +
>> +struct ww_acquire_ctx {
>> +	struct task_struct *task;
>> +	unsigned long stamp;
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +	unsigned acquired, done_acquire;
>> +	struct ww_class *ww_class;
>> +	struct ww_mutex *contending_lock;
>> +#endif
>> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
>> +	struct lockdep_map dep_map;
>> +#endif
>> +};
>> +
>> +struct ww_mutex {
>> +	struct mutex base;
>> +	struct ww_acquire_ctx *ctx;
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +	struct ww_class *ww_class;
>> +#endif
>> +};
>> +
>
>> @@ -167,6 +236,192 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
>>   */
>>  extern int mutex_trylock(struct mutex *lock);
>>  extern void mutex_unlock(struct mutex *lock);
>> +
>> +/**
>> + * ww_acquire_init - initialize a w/w acquire context
>> + * @ctx: w/w acquire context to initialize
>> + * @ww_class: w/w class of the context
>> + *
>> + * Initializes an context to acquire multiple mutexes of the given w/w class.
>> + *
>> + * Context-based w/w mutex acquiring can be done in any order whatsoever within
>> + * a given lock class. Deadlocks will be detected and handled with the
>> + * wait/wound logic.
>> + *
>> + * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
>> + * result in undetected deadlocks and is so forbidden. Mixing different contexts
>> + * for the same w/w class when acquiring mutexes can also result in undetected
>> + * deadlocks, and is hence also forbidden.
>> + *
>> + * Nesting of acquire contexts for _different_ w/w classes is possible, subject
>> + * to the usual locking rules between different lock classes.
>> + *
>> + * An acquire context must be release by the same task before the memory is
>> + * freed with ww_acquire_fini. It is recommended to allocate the context itself
>> + * on the stack.
>> + */
>> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>> +				   struct ww_class *ww_class)
>> +{
>> +	ctx->task = current;
>> +	do {
>> +		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
>> +	} while (unlikely(!ctx->stamp));
> I suppose we'll figure something out when this becomes a bottleneck. Ideally
> we'd do something like:
>
>  ctx->stamp = local_clock();
>
> but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
> too coarse to work for this.
This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.

> Also, why is 0 special?
Oops, 0 is no longer special.

I used to set the stamp directly on the lock, so 0 used to mean no ctx set.
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +	ctx->ww_class = ww_class;
>> +	ctx->acquired = ctx->done_acquire = 0;
>> +	ctx->contending_lock = NULL;
>> +#endif
>> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
>> +	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
>> +	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
>> +			 &ww_class->acquire_key, 0);
>> +	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
>> +#endif
>> +}
>> +/**
>> + * ww_mutex_trylock_single - tries to acquire the w/w mutex without acquire context
>> + * @lock: mutex to lock
>> + *
>> + * Trylocks a mutex without acquire context, so no deadlock detection is
>> + * possible. Returns 0 if the mutex has been acquired.
>> + *
>> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
>> + */
>> +static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
>> +{
>> +	return mutex_trylock(&lock->base);
>> +}
> trylocks can never deadlock they don't block per definition, I don't see the
> point of the _single() thing here.
I called it single because they weren't annotated into any ctx. I can drop the _single suffix though,
but you'd still need to unlock with unlock_single, or we need to remove that distinction altogether,
lose a few lockdep checks and only have one unlock function.

>> +/**
>> + * ww_mutex_lock_single - acquire the w/w mutex without acquire context
>> + * @lock: mutex to lock
>> + *
>> + * Locks a mutex without acquire context, so no deadlock detection is
>> + * possible.
>> + *
>> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
>> + */
>> +static inline void ww_mutex_lock_single(struct ww_mutex *lock)
>> +{
>> +	mutex_lock(&lock->base);
>> +}
> as per the above, I'm missing the rationale for having this.
>
>> diff --git a/kernel/mutex.c b/kernel/mutex.c
>> index 84a5f07..66807c7 100644
>> --- a/kernel/mutex.c
>> +++ b/kernel/mutex.c
>> @@ -127,16 +127,156 @@ void __sched mutex_unlock(struct mutex *lock)
>>  
>>  EXPORT_SYMBOL(mutex_unlock);
>>  
>> +/**
>> + * ww_mutex_unlock - release the w/w mutex
>> + * @lock: the mutex to be released
>> + *
>> + * Unlock a mutex that has been locked by this task previously
>> + * with ww_mutex_lock* using an acquire context. It is forbidden to release the
>> + * locks after releasing the acquire context.
>> + *
>> + * This function must not be used in interrupt context. Unlocking
>> + * of a unlocked mutex is not allowed.
>> + *
>> + * Note that locks acquired with one of the ww_mutex_lock*single variant must be
>> + * unlocked with ww_mutex_unlock_single.
>> + */
>> +void __sched ww_mutex_unlock(struct ww_mutex *lock)
>> +{
>> +	/*
>> +	 * The unlocking fastpath is the 0->1 transition from 'locked'
>> +	 * into 'unlocked' state:
>> +	 */
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +	DEBUG_LOCKS_WARN_ON(!lock->ctx);
>> +	if (lock->ctx) {
>> +		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
>> +		if (lock->ctx->acquired > 0)
>> +			lock->ctx->acquired--;
>> +	}
>> +#endif
>> +	lock->ctx = NULL;
> barriers should always have a comment explaining the exact ordering and the
> pairing barrier's location.
>
>> +	smp_mb__before_atomic_inc();
>> +
>> +#ifndef CONFIG_DEBUG_MUTEXES
>> +	/*
>> +	 * When debugging is enabled we must not clear the owner before time,
>> +	 * the slow path will always be taken, and that clears the owner field
>> +	 * after verifying that it was indeed current.
>> +	 */
>> +	mutex_clear_owner(&lock->base);
>> +#endif
>> +	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
>> +}
>> +EXPORT_SYMBOL(ww_mutex_unlock);
>> +
>> +static inline int __sched
>> +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
>> +{
>> +	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
>> +	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
>> +
>> +	if (!hold_ctx)
>> +		return 0;
>> +
>> +	if (unlikely(ctx->stamp == hold_ctx->stamp))
>> +		return -EALREADY;
> Why compare stamps? I expected: ctx == hold_ctx here.
Because the check just below it compares stamps too, having the same check
would make it clear that the case ctx->stamp - hold_ctx->stamp == 0 is not expected there.

>> +
>> +	if (unlikely(ctx->stamp - hold_ctx->stamp <= LONG_MAX)) {
> Why not simply write: ctx->stamp > hold_ctx->stamp ?
To handle the wraparound case on 32-bits?
> If we need to deal with equal stamps from different contexts we could tie-break
> based on ctx address or so, but seeing its a global counter from the class,
> that shouldn't happen for now.
Sounds good enough to me.

>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +		DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
>> +		ctx->contending_lock = ww;
>> +#endif
>> +		return -EDEADLK;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
>> +						   struct ww_acquire_ctx *ww_ctx,
>> +						   bool ww_slow)
>> +{
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +	/*
>> +	 * If this WARN_ON triggers, you used mutex_lock to acquire,
>> +	 * but released with ww_mutex_unlock in this call.
>> +	 */
>> +	DEBUG_LOCKS_WARN_ON(ww->ctx);
>> +
>> +	/*
>> +	 * Not quite done after ww_acquire_done() ?
>> +	 */
>> +	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
>> +
>> +	if (ww_slow) {
> s/ww_slow/!ww_ctx->acquired/
>
>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
>> +		ww_ctx->contending_lock = NULL;
>> +	} else
>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
>> +
>> +
>> +	/*
>> +	 * Naughty, using a different class can lead to undefined behavior!
>> +	 */
>> +	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
>> +
>> +	if (ww_slow)
>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
>> +
>> +	ww_ctx->acquired++;
>> +#endif
>> +}
>> +
>> +/*
>> + * after acquiring lock with fastpath or when we lost out in contested
>> + * slowpath, set ctx and wake up any waiters so they can recheck.
>> + *
>> + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
>> + * as the fastpath and opportunistic spinning are disabled in that case.
>> + */
>> +static __always_inline void
>> +ww_mutex_set_context_fastpath(struct ww_mutex *lock,
>> +			       struct ww_acquire_ctx *ctx)
>> +{
>> +	unsigned long flags;
>> +	struct mutex_waiter *cur;
>> +
>> +	ww_mutex_lock_acquired(lock, ctx, false);
>> +
>> +	lock->ctx = ctx;
>  missing comment
Yeah, this was patched up as per danvet's command, moved the smp_mb__after upwards, and added a full smp_mb after setting lock->ctx.
>> +	smp_mb__after_atomic_dec();
>> +
>> +	/*
>> +	 * Check if lock is contended, if not there is nobody to wake up
>> +	 */
>> +	if (likely(atomic_read(&lock->base.count) == 0))
>> +		return;
>> +
>> +	/*
>> +	 * Uh oh, we raced in fastpath, wake up everyone in this case,
>> +	 * so they can see the new ctx
>> +	 */
>> +	spin_lock_mutex(&lock->base.wait_lock, flags);
>> +	list_for_each_entry(cur, &lock->base.wait_list, list) {
>> +		debug_mutex_wake_waiter(&lock->base, cur);
>> +		wake_up_process(cur->task);
>> +	}
>> +	spin_unlock_mutex(&lock->base.wait_lock, flags);
>> +}
>> +
>>  /*
>>   * Lock a mutex (possibly interruptible), slowpath:
>>   */
>> -static inline int __sched
>> +static __always_inline int __sched
>>  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>> -		    struct lockdep_map *nest_lock, unsigned long ip)
>> +		    struct lockdep_map *nest_lock, unsigned long ip,
>> +		    struct ww_acquire_ctx *ww_ctx, bool ww_slow)
>>  {
>>  	struct task_struct *task = current;
>>  	struct mutex_waiter waiter;
>>  	unsigned long flags;
>> +	int ret;
>>  
>>  	preempt_disable();
>>  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
>> @@ -163,6 +303,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>  	for (;;) {
>>  		struct task_struct *owner;
>>  
>> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
> Since we _know_ ww_ctx isn't NULL, we can trivially do: s/ww_slow/!ww_ctx->acquired/
>
>> +			struct ww_mutex *ww;
>> +
>> +			ww = container_of(lock, struct ww_mutex, base);
>> +			if (ACCESS_ONCE(ww->ctx))
> What's the point of this ACCESS_ONCE()?
Break out of the spin_on_owner loop. Without taking spin_lock_mutex there is no guarantee that the
contents of ww->ctx are valid or sane in any way, so there's no way to check if we ought to back off or not.


>> +				break;
>> +		}
>> +
>>  		/*
>>  		 * If there's an owner, wait for it to either
>>  		 * release the lock or go to sleep.
>> @@ -173,6 +321,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>  
>>  		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
>>  			lock_acquired(&lock->dep_map, ip);
> Should this not also have a __builtin_constant_p(ww_ctx == NULL) ?
ww_slow should not be set to non-zero when ww_ctx == NULL.

>> +			if (ww_slow) {
>> +				struct ww_mutex *ww;
>> +				ww = container_of(lock, struct ww_mutex, base);
>> +
>> +				ww_mutex_set_context_fastpath(ww, ww_ctx);
>> +			}
>> +
>>  			mutex_set_owner(lock);
>>  			preempt_enable();
>>  			return 0;
>> @@ -228,15 +383,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>  		 * TASK_UNINTERRUPTIBLE case.)
>>  		 */
>>  		if (unlikely(signal_pending_state(state, task))) {
>> -			mutex_remove_waiter(lock, &waiter,
>> -					    task_thread_info(task));
>> -			mutex_release(&lock->dep_map, 1, ip);
>> -			spin_unlock_mutex(&lock->wait_lock, flags);
>> +			ret = -EINTR;
>> +			goto err;
>> +		}
>>  
>> -			debug_mutex_free_waiter(&waiter);
>> -			preempt_enable();
>> -			return -EINTR;
>> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
>> +			ret = __mutex_lock_check_stamp(lock, ww_ctx);
>> +			if (ret)
>> +				goto err;
>>  		}
>> +
>>  		__set_task_state(task, state);
>>  
>>  		/* didn't get the lock, go to sleep: */
>> @@ -251,6 +407,30 @@ done:
>>  	mutex_remove_waiter(lock, &waiter, current_thread_info());
>>  	mutex_set_owner(lock);
>>  
>> +	if (!__builtin_constant_p(ww_ctx == NULL)) {
>> +		struct ww_mutex *ww = container_of(lock,
>> +						      struct ww_mutex,
>> +						      base);
>> +		struct mutex_waiter *cur;
>> +
>> +		/*
>> +		 * This branch gets optimized out for the common case,
>> +		 * and is only important for ww_mutex_lock.
>> +		 */
>> +
>> +		ww_mutex_lock_acquired(ww, ww_ctx, ww_slow);
>> +		ww->ctx = ww_ctx;
>> +
>> +		/*
>> +		 * Give any possible sleeping processes the chance to wake up,
>> +		 * so they can recheck if they have to back off.
>> +		 */
>> +		list_for_each_entry(cur, &lock->wait_list, list) {
>> +			debug_mutex_wake_waiter(lock, cur);
>> +			wake_up_process(cur->task);
>> +		}
>> +	}
>> +
>>  	/* set it to 0 if there are no waiters left: */
>>  	if (likely(list_empty(&lock->wait_list)))
>>  		atomic_set(&lock->count, 0);
>> @@ -261,6 +441,14 @@ done:
>>  	preempt_enable();
>>  
>>  	return 0;
>> +
>> +err:
>> +	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
>> +	spin_unlock_mutex(&lock->wait_lock, flags);
>> +	debug_mutex_free_waiter(&waiter);
>> +	mutex_release(&lock->dep_map, 1, ip);
>> +	preempt_enable();
>> +	return ret;
>>  }
>>  
>>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>> @@ -268,7 +456,8 @@ void __sched
>>  mutex_lock_nested(struct mutex *lock, unsigned int subclass)
>>  {
>>  	might_sleep();
>> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
>> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
>> +			    subclass, NULL, _RET_IP_, 0, 0);
>>  }
> The pedant in me has to tell you 4x that NULL != 0 :-)
>
>> +EXPORT_SYMBOL_GPL(ww_mutex_lock);
>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow);
>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow_interruptible);
> Not having to do the _slow stuff saves lines and interface complexity!
It will also reduce useful debugging information returned a little. Danvet answered it better than me.

>> @@ -401,20 +738,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
>>  {
>>  	struct mutex *lock = container_of(lock_count, struct mutex, count);
>>  
>> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
>> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
>> +			    NULL, _RET_IP_, 0, 0);
>>  }
>>  
>>  static noinline int __sched
>>  __mutex_lock_killable_slowpath(struct mutex *lock)
>>  {
>> -	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
>> +	return __mutex_lock_common(lock, TASK_KILLABLE, 0,
>> +				   NULL, _RET_IP_, 0, 0);
>>  }
>>  
>>  static noinline int __sched
>>  __mutex_lock_interruptible_slowpath(struct mutex *lock)
>>  {
>> -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
>> +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
>> +				   NULL, _RET_IP_, 0, 0);
>>  }
> A few more cases where NULL != 0 :-)

But yeah, so open questions..

1. Do you still want to get rid of the _single variants, even though doing so would slightly reduce debugging?
2. Do you really want to drop the *_slow variants?
Doing so might reduce debugging slightly. I like method #2 in ww-mutex-design.txt, it makes it very clear why you
would handle the *_slow case differently anyway.
3. is a smp_mb needed to serialize lock->ctx with the atomic_read?

(mutex locked in fastpath, which is typically an atomic_dec operation)
smp_mb__after_atomic_dec();
lock->ctx = ..;
smp_mb();
if (atomic_read(lock->count) == 0) return;

feels a bit like overkill to me.

~Maarten
Maarten Lankhorst May 23, 2013, 9:13 a.m. UTC | #9
Op 22-05-13 19:24, Maarten Lankhorst schreef:
> Hey,
>
> Op 22-05-13 18:18, Peter Zijlstra schreef:
>> On Wed, May 22, 2013 at 01:18:14PM +0200, Maarten Lankhorst wrote:
>>
>> Lacking the actual msg atm, I'm going to paste in here...
> Thanks for taking the time to review.
>>> Subject: [PATCH v3 2/3] mutex: add support for wound/wait style locks, v3
>>> From: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>>>
>>> Changes since RFC patch v1:
>>>  - Updated to use atomic_long instead of atomic, since the reservation_id was a long.
>>>  - added mutex_reserve_lock_slow and mutex_reserve_lock_intr_slow
>>>  - removed mutex_locked_set_reservation_id (or w/e it was called)
>>> Changes since RFC patch v2:
>>>  - remove use of __mutex_lock_retval_arg, add warnings when using wrong combination of
>>>    mutex_(,reserve_)lock/unlock.
>>> Changes since v1:
>>>  - Add __always_inline to __mutex_lock_common, otherwise reservation paths can be
>>>    triggered from normal locks, because __builtin_constant_p might evaluate to false
>>>    for the constant 0 in that case. Tests for this have been added in the next patch.
>>>  - Updated documentation slightly.
>>> Changes since v2:
>>>  - Renamed everything to ww_mutex. (mlankhorst)
>>>  - Added ww_acquire_ctx and ww_class. (mlankhorst)
>>>  - Added a lot of checks for wrong api usage. (mlankhorst)
>>>  - Documentation updates. (danvet)
>>>
>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxx>
>>> Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxx>
>>> ---
>>>  Documentation/ww-mutex-design.txt |  322 +++++++++++++++++++++++++++
>>>  include/linux/mutex-debug.h       |    1 
>>>  include/linux/mutex.h             |  257 +++++++++++++++++++++
>>>  kernel/mutex.c                    |  445 ++++++++++++++++++++++++++++++++++++-
>>>  lib/debug_locks.c                 |    2 
>>>  5 files changed, 1010 insertions(+), 17 deletions(-)
>>>  create mode 100644 Documentation/ww-mutex-design.txt
>>>
>>> diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
>>> new file mode 100644
>>> index 0000000..154bae3
>>> --- /dev/null
>>> +++ b/Documentation/ww-mutex-design.txt
>>> @@ -0,0 +1,322 @@
>>> +Wait/Wound Deadlock-Proof Mutex Design
>>> +======================================
>>> +
>>> +Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
>>> +
>>> +Motivation for WW-Mutexes
>>> +-------------------------
>>> +
>>> +GPU's do operations that commonly involve many buffers.  Those buffers
>>> +can be shared across contexts/processes, exist in different memory
>>> +domains (for example VRAM vs system memory), and so on.  And with
>>> +PRIME / dmabuf, they can even be shared across devices.  So there are
>>> +a handful of situations where the driver needs to wait for buffers to
>>> +become ready.  If you think about this in terms of waiting on a buffer
>>> +mutex for it to become available, this presents a problem because
>>> +there is no way to guarantee that buffers appear in a execbuf/batch in
>>> +the same order in all contexts.  That is directly under control of
>>> +userspace, and a result of the sequence of GL calls that an application
>>> +makes.	Which results in the potential for deadlock.  The problem gets
>>> +more complex when you consider that the kernel may need to migrate the
>>> +buffer(s) into VRAM before the GPU operates on the buffer(s), which
>>> +may in turn require evicting some other buffers (and you don't want to
>>> +evict other buffers which are already queued up to the GPU), but for a
>>> +simplified understanding of the problem you can ignore this.
>>> +
>>> +The algorithm that TTM came up with for dealing with this problem is quite
>>> +simple.  For each group of buffers (execbuf) that need to be locked, the caller
>>> +would be assigned a unique reservation id/ticket, from a global counter.  In
>>> +case of deadlock while locking all the buffers associated with a execbuf, the
>>> +one with the lowest reservation ticket (i.e. the oldest task) wins, and the one
>>> +with the higher reservation id (i.e. the younger task) unlocks all of the
>>> +buffers that it has already locked, and then tries again.
>>> +
>>> +In the RDBMS literature this deadlock handling approach is called wait/wound:
>>> +The older task waits until it can acquire the contended lock. The younger task
>>> +needs to back off and drop all the locks it is currently holding, i.e. the
>>> +younger task is wounded.
>>> +
>>> +Concepts
>>> +--------
>>> +
>>> +Compared to normal mutexes two additional concepts/objects show up in the lock
>>> +interface for w/w mutexes:
>>> +
>>> +Acquire context: To ensure eventual forward progress it is important that a task
>>> +trying to acquire locks doesn't grab a new reservation id, but keeps the one it
>>> +acquired when starting the lock acquisition. This ticket is stored in the
>>> +acquire context. Furthermore the acquire context keeps track of debugging state
>>> +to catch w/w mutex interface abuse.
>>> +
>>> +W/w class: In contrast to normal mutexes the lock class needs to be explicit for
>>> +w/w mutexes, since it is required to initialize the acquire context.
>>> +
>>> +Furthermore there are three different classes of w/w lock acquire functions:
>>> +- Normal lock acquisition with a context, using ww_mutex_lock
>>> +- Slowpath lock acquisition on the contending lock, used by the wounded task
>>> +  after having dropped all already acquired locks. These functions have the
>>> +  _slow postfix.
>> See below, I don't see the need for this interface.
>>
>>> +- Functions to only acquire a single w/w mutex, which results in the exact same
>>> +  semantics as a normal mutex. These functions have the _single postfix.
>> This is missing rationale.
> trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
> lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.
>
>>> +
>>> +Of course, all the usual variants for handling wake-ups due to signals are also
>>> +provided.
>>> +
>>> +Usage
>>> +-----
>>> +
>>> +Three different ways to acquire locks within the same w/w class. Common
>>> +definitions for methods 1&2.
>>> +
>>> +static DEFINE_WW_CLASS(ww_class);
>>> +
>>> +struct obj {
>>> +	struct ww_mutex lock;
>>> +	/* obj data */
>>> +};
>>> +
>>> +struct obj_entry {
>>> +	struct list_head *list;
>>> +	struct obj *obj;
>>> +};
>>> +
>>> +Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
>>> +This is useful if a list of required objects is already tracked somewhere.
>>> +Furthermore the lock helper can propagate the -EALREADY return code back to
>>> +the caller as a signal that an object is twice on the list. This is useful if
>>> +the list is constructed from userspace input and the ABI requires userspace to
>>> +not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
>>> +
>>> +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
>>> +{
>>> +	struct obj *res_obj = NULL;
>>> +	struct obj_entry *contended_entry = NULL;
>>> +	struct obj_entry *entry;
>>> +
>>> +	ww_acquire_init(ctx, &ww_class);
>>> +
>>> +retry:
>>> +	list_for_each_entry (list, entry) {
>>> +		if (entry == res_obj) {
>>> +			res_obj = NULL;
>>> +			continue;
>>> +		}
>>> +		ret = ww_mutex_lock(&entry->obj->lock, ctx);
>>> +		if (ret < 0) {
>>> +			contended_obj = entry;
>>> +			goto err;
>>> +		}
>>> +	}
>>> +
>>> +	ww_acquire_done(ctx);
>>> +	return 0;
>>> +
>>> +err:
>>> +	list_for_each_entry_continue_reverse (list, contended_entry, entry)
>>> +		ww_mutex_unlock(&entry->obj->lock);
>>> +
>>> +	if (res_obj)
>>> +		ww_mutex_unlock(&res_obj->lock);
>>> +
>>> +	if (ret == -EDEADLK) {
>>> +		/* we lost out in a seqno race, lock and retry.. */
>>> +		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
>> I'm missing the need for ww_mutex_lock_slow(). AFAICT we should be able to tell
>> it's the first lock in the ctx and thus we cannot possibly deadlock.
> Theoretically true, but that would require always setting ctx->acquired correctly.
> Plus that would weaken the checks. Without ww_mutex_lock_slow you can not
> say for sure all mutexes have been unlocked, and tell that what you say is really true.
>
>>> +		res_obj = contended_entry->obj;
>>> +		goto retry;
>>> +	}
>>> +	ww_acquire_fini(ctx);
>>> +
>>> +	return ret;
>>> +}
>>> +
>> ... you certainly went all out on documentation.
>>
>>> diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
>>> index 731d77d..4ac8b19 100644
>>> --- a/include/linux/mutex-debug.h
>>> +++ b/include/linux/mutex-debug.h
>>> @@ -3,6 +3,7 @@
>>>  
>>>  #include <linux/linkage.h>
>>>  #include <linux/lockdep.h>
>>> +#include <linux/debug_locks.h>
>>>  
>>>  /*
>>>   * Mutexes - debugging helpers:
>>> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
>>> index 9121595..004f863 100644
>>> --- a/include/linux/mutex.h
>>> +++ b/include/linux/mutex.h
>>> @@ -74,6 +74,35 @@ struct mutex_waiter {
>>>  #endif
>>>  };
>>>  
>>> +struct ww_class {
>>> +	atomic_long_t stamp;
>>> +	struct lock_class_key acquire_key;
>>> +	struct lock_class_key mutex_key;
>>> +	const char *acquire_name;
>>> +	const char *mutex_name;
>>> +};
>>> +
>>> +struct ww_acquire_ctx {
>>> +	struct task_struct *task;
>>> +	unsigned long stamp;
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +	unsigned acquired, done_acquire;
>>> +	struct ww_class *ww_class;
>>> +	struct ww_mutex *contending_lock;
>>> +#endif
>>> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
>>> +	struct lockdep_map dep_map;
>>> +#endif
>>> +};
>>> +
>>> +struct ww_mutex {
>>> +	struct mutex base;
>>> +	struct ww_acquire_ctx *ctx;
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +	struct ww_class *ww_class;
>>> +#endif
>>> +};
>>> +
>>> @@ -167,6 +236,192 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
>>>   */
>>>  extern int mutex_trylock(struct mutex *lock);
>>>  extern void mutex_unlock(struct mutex *lock);
>>> +
>>> +/**
>>> + * ww_acquire_init - initialize a w/w acquire context
>>> + * @ctx: w/w acquire context to initialize
>>> + * @ww_class: w/w class of the context
>>> + *
>>> + * Initializes a context to acquire multiple mutexes of the given w/w class.
>>> + *
>>> + * Context-based w/w mutex acquiring can be done in any order whatsoever within
>>> + * a given lock class. Deadlocks will be detected and handled with the
>>> + * wait/wound logic.
>>> + *
>>> + * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
>>> + * result in undetected deadlocks and is so forbidden. Mixing different contexts
>>> + * for the same w/w class when acquiring mutexes can also result in undetected
>>> + * deadlocks, and is hence also forbidden.
>>> + *
>>> + * Nesting of acquire contexts for _different_ w/w classes is possible, subject
>>> + * to the usual locking rules between different lock classes.
>>> + *
>>> + * An acquire context must be released by the same task before the memory is
>>> + * freed with ww_acquire_fini. It is recommended to allocate the context itself
>>> + * on the stack.
>>> + */
>>> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>>> +				   struct ww_class *ww_class)
>>> +{
>>> +	ctx->task = current;
>>> +	do {
>>> +		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
>>> +	} while (unlikely(!ctx->stamp));
>> I suppose we'll figure something out when this becomes a bottleneck. Ideally
>> we'd do something like:
>>
>>  ctx->stamp = local_clock();
>>
>> but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
>> too coarse to work for this.
> This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
> EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.
>
>> Also, why is 0 special?
> Oops, 0 is no longer special.
>
> I used to set the stamp directly on the lock, so 0 used to mean no ctx set.
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +	ctx->ww_class = ww_class;
>>> +	ctx->acquired = ctx->done_acquire = 0;
>>> +	ctx->contending_lock = NULL;
>>> +#endif
>>> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
>>> +	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
>>> +	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
>>> +			 &ww_class->acquire_key, 0);
>>> +	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
>>> +#endif
>>> +}
>>> +/**
>>> + * ww_mutex_trylock_single - tries to acquire the w/w mutex without acquire context
>>> + * @lock: mutex to lock
>>> + *
>>> + * Trylocks a mutex without acquire context, so no deadlock detection is
>>> + * possible. Returns 0 if the mutex has been acquired.
>>> + *
>>> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
>>> + */
>>> +static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
>>> +{
>>> +	return mutex_trylock(&lock->base);
>>> +}
>> trylocks can never deadlock they don't block per definition, I don't see the
>> point of the _single() thing here.
> I called it single because they weren't annotated into any ctx. I can drop the _single suffix though,
> but you'd still need to unlock with unlock_single, or we need to remove that distinction altogether,
> lose a few lockdep checks and only have a one unlock function.
>
>>> +/**
>>> + * ww_mutex_lock_single - acquire the w/w mutex without acquire context
>>> + * @lock: mutex to lock
>>> + *
>>> + * Locks a mutex without acquire context, so no deadlock detection is
>>> + * possible.
>>> + *
>>> + * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
>>> + */
>>> +static inline void ww_mutex_lock_single(struct ww_mutex *lock)
>>> +{
>>> +	mutex_lock(&lock->base);
>>> +}
>> as per the above, I'm missing the rationale for having this.
>>
>>> diff --git a/kernel/mutex.c b/kernel/mutex.c
>>> index 84a5f07..66807c7 100644
>>> --- a/kernel/mutex.c
>>> +++ b/kernel/mutex.c
>>> @@ -127,16 +127,156 @@ void __sched mutex_unlock(struct mutex *lock)
>>>  
>>>  EXPORT_SYMBOL(mutex_unlock);
>>>  
>>> +/**
>>> + * ww_mutex_unlock - release the w/w mutex
>>> + * @lock: the mutex to be released
>>> + *
>>> + * Unlock a mutex that has been locked by this task previously
>>> + * with ww_mutex_lock* using an acquire context. It is forbidden to release the
>>> + * locks after releasing the acquire context.
>>> + *
>>> + * This function must not be used in interrupt context. Unlocking
>>> + * of an unlocked mutex is not allowed.
>>> + *
>>> + * Note that locks acquired with one of the ww_mutex_lock*single variant must be
>>> + * unlocked with ww_mutex_unlock_single.
>>> + */
>>> +void __sched ww_mutex_unlock(struct ww_mutex *lock)
>>> +{
>>> +	/*
>>> +	 * The unlocking fastpath is the 0->1 transition from 'locked'
>>> +	 * into 'unlocked' state:
>>> +	 */
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +	DEBUG_LOCKS_WARN_ON(!lock->ctx);
>>> +	if (lock->ctx) {
>>> +		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
>>> +		if (lock->ctx->acquired > 0)
>>> +			lock->ctx->acquired--;
>>> +	}
>>> +#endif
>>> +	lock->ctx = NULL;
>> barriers should always have a comment explaining the exact ordering and the
>> pairing barrier's location.
>>
>>> +	smp_mb__before_atomic_inc();
>>> +
>>> +#ifndef CONFIG_DEBUG_MUTEXES
>>> +	/*
>>> +	 * When debugging is enabled we must not clear the owner before time,
>>> +	 * the slow path will always be taken, and that clears the owner field
>>> +	 * after verifying that it was indeed current.
>>> +	 */
>>> +	mutex_clear_owner(&lock->base);
>>> +#endif
>>> +	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
>>> +}
>>> +EXPORT_SYMBOL(ww_mutex_unlock);
>>> +
>>> +static inline int __sched
>>> +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
>>> +{
>>> +	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
>>> +	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
>>> +
>>> +	if (!hold_ctx)
>>> +		return 0;
>>> +
>>> +	if (unlikely(ctx->stamp == hold_ctx->stamp))
>>> +		return -EALREADY;
>> Why compare stamps? I expected: ctx == hold_ctx here.
> Because the check just below it compares stamps too, having the same check
> would make it clear that when ctx->stamp - hold_ctx->stamp == 0 is not expected.
>
>>> +
>>> +	if (unlikely(ctx->stamp - hold_ctx->stamp <= LONG_MAX)) {
>> Why not simply write: ctx->stamp > hold_ctx->stamp ?
> To handle the wraparound case on 32-bits?
>> If we need to deal with equal stamps from different contexts we could tie-break
>> based on ctx address or so, but seeing its a global counter from the class,
>> that shouldn't happen for now.
> Sounds good enough to me.
>
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +		DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
>>> +		ctx->contending_lock = ww;
>>> +#endif
>>> +		return -EDEADLK;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
>>> +						   struct ww_acquire_ctx *ww_ctx,
>>> +						   bool ww_slow)
>>> +{
>>> +#ifdef CONFIG_DEBUG_MUTEXES
>>> +	/*
>>> +	 * If this WARN_ON triggers, you used mutex_lock to acquire,
>>> +	 * but released with ww_mutex_unlock in this call.
>>> +	 */
>>> +	DEBUG_LOCKS_WARN_ON(ww->ctx);
>>> +
>>> +	/*
>>> +	 * Not quite done after ww_acquire_done() ?
>>> +	 */
>>> +	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
>>> +
>>> +	if (ww_slow) {
>> s/ww_slow/!ww_ctx->acquired/
>>
>>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
>>> +		ww_ctx->contending_lock = NULL;
>>> +	} else
>>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
>>> +
>>> +
>>> +	/*
>>> +	 * Naughty, using a different class can lead to undefined behavior!
>>> +	 */
>>> +	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
>>> +
>>> +	if (ww_slow)
>>> +		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
>>> +
>>> +	ww_ctx->acquired++;
>>> +#endif
>>> +}
>>> +
>>> +/*
>>> + * after acquiring lock with fastpath or when we lost out in contested
>>> + * slowpath, set ctx and wake up any waiters so they can recheck.
>>> + *
>>> + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
>>> + * as the fastpath and opportunistic spinning are disabled in that case.
>>> + */
>>> +static __always_inline void
>>> +ww_mutex_set_context_fastpath(struct ww_mutex *lock,
>>> +			       struct ww_acquire_ctx *ctx)
>>> +{
>>> +	unsigned long flags;
>>> +	struct mutex_waiter *cur;
>>> +
>>> +	ww_mutex_lock_acquired(lock, ctx, false);
>>> +
>>> +	lock->ctx = ctx;
>>  missing comment
> Yeah, this was patched up as per danvet's command, moved the smp_mb__after upwards, and added a full smp_mb after setting lock->ctx.
>>> +	smp_mb__after_atomic_dec();
>>> +
>>> +	/*
>>> +	 * Check if lock is contended, if not there is nobody to wake up
>>> +	 */
>>> +	if (likely(atomic_read(&lock->base.count) == 0))
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Uh oh, we raced in fastpath, wake up everyone in this case,
>>> +	 * so they can see the new ctx
>>> +	 */
>>> +	spin_lock_mutex(&lock->base.wait_lock, flags);
>>> +	list_for_each_entry(cur, &lock->base.wait_list, list) {
>>> +		debug_mutex_wake_waiter(&lock->base, cur);
>>> +		wake_up_process(cur->task);
>>> +	}
>>> +	spin_unlock_mutex(&lock->base.wait_lock, flags);
>>> +}
>>> +
>>>  /*
>>>   * Lock a mutex (possibly interruptible), slowpath:
>>>   */
>>> -static inline int __sched
>>> +static __always_inline int __sched
>>>  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>> -		    struct lockdep_map *nest_lock, unsigned long ip)
>>> +		    struct lockdep_map *nest_lock, unsigned long ip,
>>> +		    struct ww_acquire_ctx *ww_ctx, bool ww_slow)
>>>  {
>>>  	struct task_struct *task = current;
>>>  	struct mutex_waiter waiter;
>>>  	unsigned long flags;
>>> +	int ret;
>>>  
>>>  	preempt_disable();
>>>  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
>>> @@ -163,6 +303,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>>  	for (;;) {
>>>  		struct task_struct *owner;
>>>  
>>> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
>> Since we _know_ ww_ctx isn't NULL, we can trivially do: s/ww_slow/!ww_ctx->acquired/
>>
>>> +			struct ww_mutex *ww;
>>> +
>>> +			ww = container_of(lock, struct ww_mutex, base);
>>> +			if (ACCESS_ONCE(ww->ctx))
>> What's the point of this ACCESS_ONCE()?
> Break out of the spin_on_owner loop. Without taking spin_lock_mutex there is no guarantee that the
> contents of ww->ctx are valid or sane in any way, so there's no way to check if we ought to back off or not.
>
>
>>> +				break;
>>> +		}
>>> +
>>>  		/*
>>>  		 * If there's an owner, wait for it to either
>>>  		 * release the lock or go to sleep.
>>> @@ -173,6 +321,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>>  
>>>  		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
>>>  			lock_acquired(&lock->dep_map, ip);
>> Should this not also have a __builtin_constant_p(ww_ctx == NULL) ?
> ww_slow should not be set to non-zero when ww_ctx == NULL.
>
>>> +			if (ww_slow) {
>>> +				struct ww_mutex *ww;
>>> +				ww = container_of(lock, struct ww_mutex, base);
>>> +
>>> +				ww_mutex_set_context_fastpath(ww, ww_ctx);
>>> +			}
>>> +
>>>  			mutex_set_owner(lock);
>>>  			preempt_enable();
>>>  			return 0;
>>> @@ -228,15 +383,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
>>>  		 * TASK_UNINTERRUPTIBLE case.)
>>>  		 */
>>>  		if (unlikely(signal_pending_state(state, task))) {
>>> -			mutex_remove_waiter(lock, &waiter,
>>> -					    task_thread_info(task));
>>> -			mutex_release(&lock->dep_map, 1, ip);
>>> -			spin_unlock_mutex(&lock->wait_lock, flags);
>>> +			ret = -EINTR;
>>> +			goto err;
>>> +		}
>>>  
>>> -			debug_mutex_free_waiter(&waiter);
>>> -			preempt_enable();
>>> -			return -EINTR;
>>> +		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
>>> +			ret = __mutex_lock_check_stamp(lock, ww_ctx);
>>> +			if (ret)
>>> +				goto err;
>>>  		}
>>> +
>>>  		__set_task_state(task, state);
>>>  
>>>  		/* didn't get the lock, go to sleep: */
>>> @@ -251,6 +407,30 @@ done:
>>>  	mutex_remove_waiter(lock, &waiter, current_thread_info());
>>>  	mutex_set_owner(lock);
>>>  
>>> +	if (!__builtin_constant_p(ww_ctx == NULL)) {
>>> +		struct ww_mutex *ww = container_of(lock,
>>> +						      struct ww_mutex,
>>> +						      base);
>>> +		struct mutex_waiter *cur;
>>> +
>>> +		/*
>>> +		 * This branch gets optimized out for the common case,
>>> +		 * and is only important for ww_mutex_lock.
>>> +		 */
>>> +
>>> +		ww_mutex_lock_acquired(ww, ww_ctx, ww_slow);
>>> +		ww->ctx = ww_ctx;
>>> +
>>> +		/*
>>> +		 * Give any possible sleeping processes the chance to wake up,
>>> +		 * so they can recheck if they have to back off.
>>> +		 */
>>> +		list_for_each_entry(cur, &lock->wait_list, list) {
>>> +			debug_mutex_wake_waiter(lock, cur);
>>> +			wake_up_process(cur->task);
>>> +		}
>>> +	}
>>> +
>>>  	/* set it to 0 if there are no waiters left: */
>>>  	if (likely(list_empty(&lock->wait_list)))
>>>  		atomic_set(&lock->count, 0);
>>> @@ -261,6 +441,14 @@ done:
>>>  	preempt_enable();
>>>  
>>>  	return 0;
>>> +
>>> +err:
>>> +	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
>>> +	spin_unlock_mutex(&lock->wait_lock, flags);
>>> +	debug_mutex_free_waiter(&waiter);
>>> +	mutex_release(&lock->dep_map, 1, ip);
>>> +	preempt_enable();
>>> +	return ret;
>>>  }
>>>  
>>>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>> @@ -268,7 +456,8 @@ void __sched
>>>  mutex_lock_nested(struct mutex *lock, unsigned int subclass)
>>>  {
>>>  	might_sleep();
>>> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
>>> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
>>> +			    subclass, NULL, _RET_IP_, 0, 0);
>>>  }
>> The pedant in me has to tell you 4x that NULL != 0 :-)
>>
>>> +EXPORT_SYMBOL_GPL(ww_mutex_lock);
>>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
>>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow);
>>> +EXPORT_SYMBOL_GPL(ww_mutex_lock_slow_interruptible);
>> Not having to do the _slow stuff saves lines and interface complexity!
> It will also reduce useful debugging information returned a little. Danvet answered it better than me.
>
>>> @@ -401,20 +738,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
>>>  {
>>>  	struct mutex *lock = container_of(lock_count, struct mutex, count);
>>>  
>>> -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
>>> +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
>>> +			    NULL, _RET_IP_, 0, 0);
>>>  }
>>>  
>>>  static noinline int __sched
>>>  __mutex_lock_killable_slowpath(struct mutex *lock)
>>>  {
>>> -	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
>>> +	return __mutex_lock_common(lock, TASK_KILLABLE, 0,
>>> +				   NULL, _RET_IP_, 0, 0);
>>>  }
>>>  
>>>  static noinline int __sched
>>>  __mutex_lock_interruptible_slowpath(struct mutex *lock)
>>>  {
>>> -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
>>> +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
>>> +				   NULL, _RET_IP_, 0, 0);
>>>  }
>> A few more cases where NULL != 0 :-)
> But yeah, so open questions..
>
> 1. Do you still want to get rid of the _single variants, even though doing so would slightly reduce debugging?
This is not as disastrous as I originally thought.
lock_single needs to be kept, unlock_single can be removed without consequences.
trylock_single could then be renamed to trylock, since it's no longer paired with unlock_single.

It will prevent unlock and unlock_single from being mismatched, because the same code would get executed.
Calling the mutex_unlock function directly on a ww_mutex must then be forbidden, as it's only an implementation detail.


> 2. Do you really want to drop the *_slow variants?
> Doing so might reduce debugging slightly. I like method #2 in ww-mutex-design.txt, it makes it very clear why you
> would handle the *_slow case differently anyway.
As you pointed out, we wouldn't lose much debugging information.
The same checks could be done in the normal variant with
WARN_ON(ctx->lock && ctx->lock != lock);
WARN_ON(ctx->lock && ctx->acquired > 0);

But it boils down to ww_mutex_lock_slow returning void instead of int __must_check from ww_mutex_lock.

Maybe add inlines for *_slow, that use the ww_mutex_lock functions, and check ctx->lock == lock in debugging mode?
> 3. is a smp_mb needed to serialize lock->ctx with the atomic_read?
>
> (mutex locked in fastpath, which is typically an atomic_dec operation)
> smp_mb__after_atomic_dec();
> lock->ctx = ..;
> smp_mb();
> if (atomic_read(lock->count) == 0) return;
>
> feels a bit like overkill to me.
>
> ~Maarten
Daniel Vetter May 23, 2013, 10:45 a.m. UTC | #10
On Thu, May 23, 2013 at 11:13 AM, Maarten Lankhorst
<maarten.lankhorst@canonical.com> wrote:
>> 2. Do you really want to drop the *_slow variants?
>> Doing so might reduce debugging slightly. I like method #2 in ww-mutex-design.txt, it makes it very clear why you
>> would handle the *_slow case differently anyway.
> As you pointed out, we wouldn't lose much debugging information.
> The same checks could be done in the normal variant with
> WARN_ON(ctx->lock && ctx->lock != lock);
> WARN_ON(ctx->lock && ctx->acquired > 0);

s/lock/contending_lock/ I guess. But yeah, I should have more
carefully read Peter's suggestion to fold in some of the ww_slow debug
checks, we can indeed keep the important debug checks even when
dropping slow. Silly me should be less sloppy.

> But it boils down to ww_mutex_lock_slow returning void instead of int __must_check from ww_mutex_lock.
>
> Maybe add inlines for *_slow, that use the ww_mutex_lock functions, and check ctx->lock == lock in debugging mode?

So either we keep the _slow versions or drop the __must_check for
ww_mutex_lock. In both cases the ww mutex user needs to think a bit
what to do, and I don't think there's much we can do in the implementation
(beside all the existing debug support we have) to help. So now I'm
leaning more towards dropping the _slow variants to avoid interface
proliferation.
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
Peter Zijlstra May 27, 2013, 8 a.m. UTC | #11
On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
> >> +- Functions to only acquire a single w/w mutex, which results in the exact same
> >> +  semantics as a normal mutex. These functions have the _single postfix.
> > This is missing rationale.

> trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
> lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.

OK, so given that its still early, monday and I haven't actually spend
much time thinking on this; would it be possible to make:
ww_mutex_lock(.ctx=NULL) act like ww_mutex_lock_single()?

The idea is that if we don't provide a ctx, we'll get a different
lockdep annotation; mutex_lock() vs mutex_lock_nest_lock(). So if we
then go and make a mistake, lockdep should warn us.

Would that work or should I stock up on morning juice?
Peter Zijlstra May 27, 2013, 8:21 a.m. UTC | #12
On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
> >> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
> >> +				   struct ww_class *ww_class)
> >> +{
> >> +	ctx->task = current;
> >> +	do {
> >> +		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
> >> +	} while (unlikely(!ctx->stamp));
> > I suppose we'll figure something out when this becomes a bottleneck. Ideally
> > we'd do something like:
> >
> >  ctx->stamp = local_clock();
> >
> > but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
> > too coarse to work for this.
> This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
> EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.

Right, but for now I suppose the 'global' atomic is ok, if/when we find
it hurts performance we can revisit. I was just spewing ideas :-)

> > Also, why is 0 special?
> Oops, 0 is no longer special.
> 
> I used to set the samp directly on the lock, so 0 used to mean no ctx set.

Ah, ok :-)

> >> +static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
> >> +{
> >> +	return mutex_trylock(&lock->base);
> >> +}
> > trylocks can never deadlock they don't block per definition, I don't see the
> > point of the _single() thing here.
> I called it single because they weren't annotated into any ctx. I can drop the _single suffix though,
> but you'd still need to unlock with unlock_single, or we need to remove that distinction altogether,
> lose a few lockdep checks and only have a one unlock function.

Again, early.. monday.. would a trylock, even if successful still need
the ctx?
Maarten Lankhorst May 27, 2013, 8:26 a.m. UTC | #13
Op 27-05-13 10:00, Peter Zijlstra schreef:
> On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
>>>> +- Functions to only acquire a single w/w mutex, which results in the exact same
>>>> +  semantics as a normal mutex. These functions have the _single postfix.
>>> This is missing rationale.
>> trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
>> lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.
> OK, so given that its still early, monday and I haven't actually spend
> much time thinking on this; would it be possible to make:
> ww_mutex_lock(.ctx=NULL) act like ww_mutex_lock_single()?
>
> The idea is that if we don't provide a ctx, we'll get a different
> lockdep annotation; mutex_lock() vs mutex_lock_nest_lock(). So if we
> then go and make a mistake, lockdep should warn us.
>
> Would that work or should I stock up on morning juice?
>
It's easy to merge unlock_single and unlock, which I did in the next version I'll post.
Lockdep will already warn if ww_mutex_lock and ww_mutex_lock_single are both
used. ww_test_block_context and ww_test_context_block in lib/locking-selftest.c
are the testcases for this.

The locking paths are too different, it will end up with doing "if (ctx == NULL) mutex_lock(); else ww_mutex_lock();"

~Maarten
Peter Zijlstra May 27, 2013, 8:29 a.m. UTC | #14
On Wed, May 22, 2013 at 06:49:04PM +0200, Daniel Vetter wrote:
> - _slow functions can check whether all acquire locks have been
> released and whether the caller is indeed blocking on the contending
> lock. Not doing so could either result in needless spinning instead of
> blocking (when blocking on the wrong lock) or in deadlocks (when not
> dropping all acquired).

We could add ww_mutex_assert_context_empty() or somesuch so that
paranoid people have a means of expressing themselves :-)
Peter Zijlstra May 27, 2013, 9:13 a.m. UTC | #15
On Mon, May 27, 2013 at 10:26:39AM +0200, Maarten Lankhorst wrote:
> Op 27-05-13 10:00, Peter Zijlstra schreef:
> > On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
> >>>> +- Functions to only acquire a single w/w mutex, which results in the exact same
> >>>> +  semantics as a normal mutex. These functions have the _single postfix.
> >>> This is missing rationale.
> >> trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
> >> lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.
> > OK, so given that its still early, monday and I haven't actually spend
> > much time thinking on this; would it be possible to make:
> > ww_mutex_lock(.ctx=NULL) act like ww_mutex_lock_single()?
> >
> > The idea is that if we don't provide a ctx, we'll get a different
> > lockdep annotation; mutex_lock() vs mutex_lock_nest_lock(). So if we
> > then go and make a mistake, lockdep should warn us.
> >
> > Would that work or should I stock up on morning juice?
> >
> It's easy to merge unlock_single and unlock, which I did in the next version I'll post.
> Lockdep will already warn if ww_mutex_lock and ww_mutex_lock_single are both
> used. ww_test_block_context and ww_test_context_block in lib/locking-selftest.c
> are the testcases for this.
> 
> The locking paths are too different, it will end up with doing "if (ctx == NULL) mutex_lock(); else ww_mutex_lock();"

I was more thinking like:

int __sched ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
	might_sleep();
	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
				   ctx ? ctx->dep_map : NULL, _RET_IP_,
				   ctx, 0);
}

That should make ww_mutex_lock(.ctx=NULL) equivalent to
mutex_lock(&lock->base), no?

Anyway, implementation aside, it would again reduce the interface some.
Maarten Lankhorst May 27, 2013, 9:58 a.m. UTC | #16
Op 27-05-13 11:13, Peter Zijlstra schreef:
> On Mon, May 27, 2013 at 10:26:39AM +0200, Maarten Lankhorst wrote:
>> Op 27-05-13 10:00, Peter Zijlstra schreef:
>>> On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
>>>>>> +- Functions to only acquire a single w/w mutex, which results in the exact same
>>>>>> +  semantics as a normal mutex. These functions have the _single postfix.
>>>>> This is missing rationale.
>>>> trylock_single is useful when iterating over a list, and you want to evict a bo, but only the first one that can be acquired.
>>>> lock_single is useful when only a single bo needs to be acquired, for example to lock a buffer during mmap.
>>> OK, so given that its still early, monday and I haven't actually spend
>>> much time thinking on this; would it be possible to make:
>>> ww_mutex_lock(.ctx=NULL) act like ww_mutex_lock_single()?
>>>
>>> The idea is that if we don't provide a ctx, we'll get a different
>>> lockdep annotation; mutex_lock() vs mutex_lock_nest_lock(). So if we
>>> then go and make a mistake, lockdep should warn us.
>>>
>>> Would that work or should I stock up on morning juice?
>>>
>> It's easy to merge unlock_single and unlock, which I did in the next version I'll post.
>> Lockdep will already warn if ww_mutex_lock and ww_mutex_lock_single are both
>> used. ww_test_block_context and ww_test_context_block in lib/locking-selftest.c
>> are the testcases for this.
>>
>> The locking paths are too different, it will end up with doing "if (ctx == NULL) mutex_lock(); else ww_mutex_lock();"
> I was more thinking like:
>
> int __sched ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
> {
> 	might_sleep();
> 	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
> 				   ctx ? ctx->dep_map : NULL, _RET_IP_,
> 				   ctx, 0);
> }
>
> That should make ww_mutex_lock(.ctx=NULL) equivalent to
> mutex_lock(&lock->base), no?
>
> Anyway, implementation aside, it would again reduce the interface some.
>
It doesn't work like that. __builtin_constant_p(ctx == NULL) will evaluate to false in __mutex_lock_common, even if you call ww_mutex_lock(lock, NULL);
gcc cannot prove at compile time whether ctx == NULL is true or false for the __mutex_lock_common inlining here, so __builtin_constant_p() will return false.

And again, that's just saying

ww_mutex_lock() {
if (ctx)
original ww_mutex_lock's slowpath(lock, ctx);
else
mutex_lock's slowpath(lock->base);
}

And the next version will already remove unlock_single, and this is the implementation for lock_single currently:
static inline void ww_mutex_lock_single(struct ww_mutex *lock)
{
    mutex_lock(&lock->base);
}

So why do you want to merge it?

~Maarten
Maarten Lankhorst May 27, 2013, 10:01 a.m. UTC | #17
Op 27-05-13 10:21, Peter Zijlstra schreef:
> On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
>>>> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>>>> +				   struct ww_class *ww_class)
>>>> +{
>>>> +	ctx->task = current;
>>>> +	do {
>>>> +		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
>>>> +	} while (unlikely(!ctx->stamp));
>>> I suppose we'll figure something out when this becomes a bottleneck. Ideally
>>> we'd do something like:
>>>
>>>  ctx->stamp = local_clock();
>>>
>>> but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
>>> too coarse to work for this.
>> This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
>> EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.
> Right, but for now I suppose the 'global' atomic is ok, if/when we find
> it hurts performance we can revisit. I was just spewing ideas :-)
If  accurate timers are available it wouldn't be a bad idea. I fixed up the code to at least support this case should it happen.
For now the source of the stamp is still a single atomic_long.

>>> Also, why is 0 special?
>> Oops, 0 is no longer special.
>>
>> I used to set the samp directly on the lock, so 0 used to mean no ctx set.
> Ah, ok :-)
>
>>>> +static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
>>>> +{
>>>> +	return mutex_trylock(&lock->base);
>>>> +}
>>> trylocks can never deadlock they don't block per definition, I don't see the
>>> point of the _single() thing here.
>> I called it single because they weren't annotated into any ctx. I can drop the _single suffix though,
>> but you'd still need to unlock with unlock_single, or we need to remove that distinction altogether,
>> lose a few lockdep checks and only have a one unlock function.
> Again, early.. monday.. would a trylock, even if successful still need
> the ctx?
No ctx for trylock is supported. You can still do a trylock while holding a context, but the mutex won't be
a part of the context. Normal lockdep rules apply. lib/locking-selftest.c:

context + ww_mutex_lock first, then a trylock:
dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);

trylock first, then context + ww_mutex_lock:
dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);

For now I don't want to add support for a trylock with context, I'm very glad I managed to fix ttm locking
to not require this any more, and it was needed there only because it was a workaround for the locking
being wrong.  There was no annotation for the buffer locking it was using, so the real problem wasn't easy to spot.

~Maarten
Peter Zijlstra May 27, 2013, 10:24 a.m. UTC | #18
On Mon, May 27, 2013 at 12:01:50PM +0200, Maarten Lankhorst wrote:
> > Again, early.. monday.. would a trylock, even if successful still need
> > the ctx?
> No ctx for trylock is supported. You can still do a trylock while
> holding a context, but the mutex won't be a part of the context.
> Normal lockdep rules apply. lib/locking-selftest.c:
> 
> context + ww_mutex_lock first, then a trylock:
> dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);
> 
> trylock first, then context + ww_mutex_lock:
> dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);
> 
> For now I don't want to add support for a trylock with context, I'm
> very glad I managed to fix ttm locking to not require this any more,
> and it was needed there only because it was a workaround for the
> locking being wrong.  There was no annotation for the buffer locking
> it was using, so the real problem wasn't easy to spot.

Ah, ok. 

My question really was whether there even was sense for a trylock with
context. I couldn't come up with a case for it; but I think I see one
now.

The thing is; if there could exist something like:

  ww_mutex_trylock(struct ww_mutex *, struct ww_acquire_ctx *ctx);

Then we should not now take away that name and make it mean something
else; namely: ww_mutex_trylock_single().

Unless we want to allow .ctx=NULL to mean _single.

As to why I proposed that (.ctx=NULL meaning _single); I suppose because
I'm a minimalist at heart.
Maarten Lankhorst May 27, 2013, 10:52 a.m. UTC | #19
Op 27-05-13 12:24, Peter Zijlstra schreef:
> On Mon, May 27, 2013 at 12:01:50PM +0200, Maarten Lankhorst wrote:
>>> Again, early.. monday.. would a trylock, even if successful still need
>>> the ctx?
>> No ctx for trylock is supported. You can still do a trylock while
>> holding a context, but the mutex won't be a part of the context.
>> Normal lockdep rules apply. lib/locking-selftest.c:
>>
>> context + ww_mutex_lock first, then a trylock:
>> dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);
>>
>> trylock first, then context + ww_mutex_lock:
>> dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);
>>
>> For now I don't want to add support for a trylock with context, I'm
>> very glad I managed to fix ttm locking to not require this any more,
>> and it was needed there only because it was a workaround for the
>> locking being wrong.  There was no annotation for the buffer locking
>> it was using, so the real problem wasn't easy to spot.
> Ah, ok. 
>
> My question really was whether there even was sense for a trylock with
> context. I couldn't come up with a case for it; but I think I see one
> now.
The reason ttm needed it was because there was another lock that interacted
with the ctx lock in a weird way. The ww lock it was using was inverted with another
lock, so it had to grab that lock first, perform a trylock on the ww lock, and if that failed
unlock the lock, wait for it to be unlocked, then retry the same thing again.
I'm so glad I managed to fix that mess, if you really need ww_mutex_trylock with a ctx,
it's an indication your locking is wrong.

For ww_mutex_trylock with a context to be of any use you would also need to return
0 or a -errno, (-EDEADLK, -EBUSY (already locked by someone else), or -EALREADY).
This would make the trylock very different from other trylocks, and very confusing because
if (ww_mutex_trylock(lock, ctx)) would not do what you would think it would do.
> The thing is; if there could exist something like:
>
>   ww_mutex_trylock(struct ww_mutex *, struct ww_acquire_ctx *ctx);
>
> Then we should not now take away that name and make it mean something
> else; namely: ww_mutex_trylock_single().
>
> Unless we want to allow .ctx=NULL to mean _single.
>
> As to why I proposed that (.ctx=NULL meaning _single); I suppose because
> I'm a minimalist at heart.
Minimalism isn't bad, it's just knowing when to sto
Peter Zijlstra May 27, 2013, 11:15 a.m. UTC | #20
On Mon, May 27, 2013 at 12:52:00PM +0200, Maarten Lankhorst wrote:
> The reason ttm needed it was because there was another lock that interacted
> with the ctx lock in a weird way. The ww lock it was using was inverted with another
> lock, so it had to grab that lock first, perform a trylock on the ww lock, and if that failed
> unlock the lock, wait for it to be unlocked, then retry the same thing again.
> I'm so glad I managed to fix that mess, if you really need ww_mutex_trylock with a ctx,
> it's an indication your locking is wrong.
> 
> For ww_mutex_trylock with a context to be of any use you would also need to return
> 0 or a -errno, (-EDEADLK, -EBUSY (already locked by someone else), or -EALREADY).
> This would make the trylock very different from other trylocks, and very confusing because
> if (ww_mutex_trylock(lock, ctx)) would not do what you would think it would do.

Yuck ;-)

Anyway, what I was thinking of is something like:

	T0		T1

	try A
			lock B
	lock B
			lock A

Now, if for some reason T1 won the lottery such that T0 would have to be
wounded, T0's context would indicate its the first entry and not return
-EDEADLK.

OTOH, anybody doing creative things like that might well deserve
whatever they get ;-)

> > The thing is; if there could exist something like:
> >
> >   ww_mutex_trylock(struct ww_mutex *, struct ww_acquire_ctx *ctx);
> >
> > Then we should not now take away that name and make it mean something
> > else; namely: ww_mutex_trylock_single().
> >
> > Unless we want to allow .ctx=NULL to mean _single.
> >
> > As to why I proposed that (.ctx=NULL meaning _single); I suppose because
> > I'm a minimalist at heart.
> Minimalism isn't bad, it's just knowing when to sto

:-)
Maarten Lankhorst May 27, 2013, 11:24 a.m. UTC | #21
Op 27-05-13 13:15, Peter Zijlstra schreef:
> On Mon, May 27, 2013 at 12:52:00PM +0200, Maarten Lankhorst wrote:
>> The reason ttm needed it was because there was another lock that interacted
>> with the ctx lock in a weird way. The ww lock it was using was inverted with another
>> lock, so it had to grab that lock first, perform a trylock on the ww lock, and if that failed
>> unlock the lock, wait for it to be unlocked, then retry the same thing again.
>> I'm so glad I managed to fix that mess, if you really need ww_mutex_trylock with a ctx,
>> it's an indication your locking is wrong.
>>
>> For ww_mutex_trylock with a context to be of any use you would also need to return
>> 0 or a -errno, (-EDEADLK, -EBUSY (already locked by someone else), or -EALREADY).
>> This would make the trylock very different from other trylocks, and very confusing because
>> if (ww_mutex_trylock(lock, ctx)) would not do what you would think it would do.
> Yuck ;-)
>
> Anyway, what I was thinking of is something like:
>
> 	T0		T1
>
> 	try A
> 			lock B
> 	lock B
> 			lock A
>
> Now, if for some reason T1 won the lottery such that T0 would have to be
> wounded, T0's context would indicate its the first entry and not return
> -EDEADLK.
And this sounds like something lockdep is designed to complain about.

Nothing stops you from doing try A then doing try B, which would be the correct way to deal with this situation.
Why would you trylock one, and then not do the same for another?

> OTOH, anybody doing creative things like that might well deserve
> whatever they get ;-)
Indeed!

>>> The thing is; if there could exist something like:
>>>
>>>   ww_mutex_trylock(struct ww_mutex *, struct ww_acquire_ctx *ctx);
>>>
>>> Then we should not now take away that name and make it mean something
>>> else; namely: ww_mutex_trylock_single().
>>>
>>> Unless we want to allow .ctx=NULL to mean _single.
>>>
>>> As to why I proposed that (.ctx=NULL meaning _single); I suppose because
>>> I'm a minimalist at heart.
>> Minimalism isn't bad, it's just knowing when to sto
> :-)
>
Daniel Vetter May 27, 2013, 2:47 p.m. UTC | #22
On Mon, May 27, 2013 at 10:21 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
>> >> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>> >> +                             struct ww_class *ww_class)
>> >> +{
>> >> +  ctx->task = current;
>> >> +  do {
>> >> +          ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
>> >> +  } while (unlikely(!ctx->stamp));
>> > I suppose we'll figure something out when this becomes a bottleneck. Ideally
>> > we'd do something like:
>> >
>> >  ctx->stamp = local_clock();
>> >
>> > but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
>> > too coarse to work for this.
>> This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
>> EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.
>
> Right, but for now I suppose the 'global' atomic is ok, if/when we find
> it hurts performance we can revisit. I was just spewing ideas :-)

We could do a simple

ctx->stamp = (local_clock() << nr_cpu_shift) | local_processor_id()

to work around any bad luck in grabbing the ticket. With sufficient
fine clocks the bias towards smaller cpu ids would be rather
irrelevant. Just wanted to drop this idea before I'll forget about it
again ;-)
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
Daniel Vetter May 27, 2013, 2:55 p.m. UTC | #23
On Mon, May 27, 2013 at 4:47 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Mon, May 27, 2013 at 10:21 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Wed, May 22, 2013 at 07:24:38PM +0200, Maarten Lankhorst wrote:
>>> >> +static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>>> >> +                             struct ww_class *ww_class)
>>> >> +{
>>> >> +  ctx->task = current;
>>> >> +  do {
>>> >> +          ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
>>> >> +  } while (unlikely(!ctx->stamp));
>>> > I suppose we'll figure something out when this becomes a bottleneck. Ideally
>>> > we'd do something like:
>>> >
>>> >  ctx->stamp = local_clock();
>>> >
>>> > but for now we cannot guarantee that's not jiffies, and I suppose that's a tad
>>> > too coarse to work for this.
>>> This might mess up when 2 cores happen to return exactly the same time, how do you choose a winner in that case?
>>> EDIT: Using pointer address like you suggested below is fine with me. ctx pointer would be static enough.
>>
>> Right, but for now I suppose the 'global' atomic is ok, if/when we find
>> it hurts performance we can revisit. I was just spewing ideas :-)
>
> We could do a simple
>
> ctx->stamp = (local_clock() << nr_cpu_shift) | local_processor_id()
>
> to work around any bad luck in grabbing the ticket. With sufficient
> fine clocks the bias towards smaller cpu ids would be rather
> irrelevant. Just wanted to drop this idea before I'll forget about it
> again ;-)
Not a good idea to throw around random ideas right after a work-out.
This is broken since different threads could end up with the same low
bits. Comparing ctx pointers otoh on top of the timestamp should work.
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
diff mbox

Patch

diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
new file mode 100644
index 0000000..154bae3
--- /dev/null
+++ b/Documentation/ww-mutex-design.txt
@@ -0,0 +1,322 @@ 
+Wait/Wound Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers.  Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on.  And with
+PRIME / dmabuf, they can even be shared across devices.  So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready.  If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in an execbuf/batch in
+the same order in all contexts.  That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes, which results in the potential for deadlock.  The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that TTM came up with for dealing with this problem is quite
+simple.  For each group of buffers (execbuf) that need to be locked, the caller
+would be assigned a unique reservation id/ticket, from a global counter.  In
+case of deadlock while locking all the buffers associated with an execbuf, the
+one with the lowest reservation ticket (i.e. the oldest task) wins, and the one
+with the higher reservation id (i.e. the younger task) unlocks all of the
+buffers that it has already locked, and then tries again.
+
+In the RDBMS literature this deadlock handling approach is called wait/wound:
+The older task waits until it can acquire the contended lock. The younger task
+needs to back off and drop all the locks it is currently holding, i.e. the
+younger task is wounded.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important that a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context.
+
+Furthermore there are three different classes of w/w lock acquire functions:
+- Normal lock acquisition with a context, using ww_mutex_lock
+- Slowpath lock acquisition on the contending lock, used by the wounded task
+  after having dropped all already acquired locks. These functions have the
+  _slow postfix.
+- Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. These functions have the _single postfix.
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods 1&2.
+
+static DEFINE_WW_CLASS(ww_class);
+
+struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+};
+
+struct obj_entry {
+	struct list_head *list;
+	struct obj *obj;
+};
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
+
+int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+retry:
+	list_for_each_entry (list, entry) {
+		if (entry == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+err:
+	list_for_each_entry_continue_reverse (list, contended_entry, entry)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
+		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+		res_obj = contended_entry->obj;
+		goto retry;
+	}
+	ww_acquire_fini(ctx);
+
+	return ret;
+}
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code.
+
+int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj_entry *entry, *entry2;
+
+	ww_acquire_init(ctx, &ww_class);
+
+	list_for_each_entry (list, entry) {
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			entry2 = entry;
+
+			list_for_each_entry_continue_reverse (list, entry2)
+				ww_mutex_unlock(&entry2->obj->lock);
+
+			if (ret != -EDEADLK) {
+				ww_acquire_fini(ctx);
+				return ret;
+			}
+
+			/* we lost out in a seqno race, lock and retry.. */
+			ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+			/*
+			 * Move buf to head of the list, this will point
+			 * buf->next to the first unlocked entry,
+			 * restarting the for loop.
+			 */
+			list_del(&entry->list);
+			list_add(&entry->list, list);
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+}
+
+Unlocking works the same way for both methods 1&2:
+
+void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj_entry *entry;
+
+	list_for_each_entry (list, entry)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	ww_acquire_fini(ctx);
+}
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+- They can handle lock-acquisition in any order which allows us to start walking
+  a graph from a starting point and then iteratively discovering new edges and
+  locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given object is already
+  held there's no need for additional book-keeping to break cycles in the graph
+  or keep track of which locks are already held (when using more than one node
+  as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+- Since the list of objects is dynamically constructed (and might very well be
+  different when retrying due to hitting the -EDEADLK wound condition) there's
+  no need to keep any object on a persistent list when it's not locked. We can
+  therefore move the list_head into the object itself.
+- Otoh the dynamic object list construction also means that the -EALREADY return
+  code can't be propagated.
+
+Note also that methods 1&2 and method 3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method 3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here.
+
+struct obj {
+	struct ww_mutex ww_mutex;
+	struct list_head locked_list;
+};
+
+static DEFINE_WW_CLASS(ww_class);
+
+void __unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj entry;
+
+	for_each_safe (list, entry) {
+		/* need to do that before unlocking, since only the current lock holder is
+		allowed to use object */
+		list_del(entry->locked_list);
+		ww_mutex_unlock(entry->ww_mutex)
+	}
+}
+
+void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct list_head locked_buffers;
+	struct obj obj = NULL, entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+retry:
+	/* re-init loop start state */
+	loop {
+		/* magic code which walks over a graph and decides which objects
+		 * to lock */
+
+		ret = ww_mutex_lock(obj->ww_mutex, ctx);
+		if (ret == -EALREADY) {
+			/* we have that one already, get to the next object */
+			continue;
+		}
+		if (ret == -EDEADLK) {
+			__unlock_objs(list, ctx);
+
+			ww_mutex_lock_slow(obj);
+			list_add(locked_buffers, entry->locked_list);
+			goto retry;
+		}
+
+		/* locked a new object, add it to the list */
+		list_add(locked_buffers, entry->locked_list);
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+}
+
+void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	__unlock_objs(list, ctx);
+	ww_acquire_fini(ctx);
+}
+
+Method 4: Only lock one single object. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api provides a set of lock/unlock_single functions which don't require an
+acquire context.
+
+Implementation Details
+----------------------
+
+Design:
+  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+  normal mutex locks, which are far more common. As such there is only a small
+  increase in code size if wait/wound mutexes are not used.
+
+  In general, not much contention is expected. The locks are typically used to
+  serialize access to resources for devices. The only way to make wakeups
+  smarter would be at the cost of adding a field to struct mutex_waiter. This
+  would add overhead to all cases where normal mutexes are used, and
+  ww_mutexes are generally less performance sensitive.
+
+Lockdep:
+  Special care has been taken to warn for as many cases of api abuse
+  as possible. Some common api abuses will be caught with
+  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+  Some of the errors which will be warned about:
+   - Forgetting to call ww_acquire_fini or ww_acquire_init.
+   - Attempting to lock more mutexes after ww_acquire_done.
+   - Attempting to lock more mutexes after -EDEADLK,
+     before calling ww_mutex_lock_slow.
+
+   - Calling ww_mutex_lock_slow while still holding some mutexes.
+   - Calling ww_mutex_lock_slow on the wrong mutex,
+     or before -EDEADLK was returned.
+
+   - Unlocking mutexes with the wrong unlock function.
+   - Calling one of the ww_acquire_* twice on the same context.
+   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+   - Normal lockdep errors that can result in deadlocks.
+
+  Some of the lockdep errors that can result in deadlocks:
+   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+     having called ww_acquire_fini on the first.
+   - Mixing ww_mutex_lock and ww_mutex_lock_single.
+   - 'normal' deadlocks that can occur.
+
+FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
+implemented.
diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
index 731d77d..4ac8b19 100644
--- a/include/linux/mutex-debug.h
+++ b/include/linux/mutex-debug.h
@@ -3,6 +3,7 @@ 
 
 #include <linux/linkage.h>
 #include <linux/lockdep.h>
+#include <linux/debug_locks.h>
 
 /*
  * Mutexes - debugging helpers:
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 9121595..004f863 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -74,6 +74,35 @@  struct mutex_waiter {
 #endif
 };
 
+struct ww_class {
+	atomic_long_t stamp;			/* source of acquire-context stamps */
+	struct lock_class_key acquire_key;	/* lockdep key for acquire contexts */
+	struct lock_class_key mutex_key;	/* lockdep key for the class's mutexes */
+	const char *acquire_name;		/* lockdep name for acquire contexts */
+	const char *mutex_name;			/* lockdep name for the class's mutexes */
+};
+
+struct ww_acquire_ctx {
+	struct task_struct *task;	/* set to current by ww_acquire_init() */
+	unsigned long stamp;		/* non-zero, drawn from ww_class->stamp */
+#ifdef CONFIG_DEBUG_MUTEXES
+	unsigned acquired, done_acquire; /* held-lock count; acquire phase over */
+	struct ww_class *ww_class;	/* class given to ww_acquire_init() */
+	struct ww_mutex *contending_lock; /* lock that returned -EDEADLK */
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;	/* acquired in init, released in fini */
+#endif
+};
+
+struct ww_mutex {
+	struct mutex base;		/* the underlying mutex */
+	struct ww_acquire_ctx *ctx;	/* holder's acquire context, or NULL */
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct ww_class *ww_class;	/* class given to ww_mutex_init() */
+#endif
+};
+
 #ifdef CONFIG_DEBUG_MUTEXES
 # include <linux/mutex-debug.h>
 #else
@@ -98,8 +127,11 @@  static inline void mutex_destroy(struct mutex *lock) {}
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
 		, .dep_map = { .name = #lockname }
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \
+		, .ww_class = &class /* arg must not be named ww_class: it would rewrite the designator */
 #else
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
 #endif
 
 #define __MUTEX_INITIALIZER(lockname) \
@@ -109,13 +141,49 @@  static inline void mutex_destroy(struct mutex *lock) {}
 		__DEBUG_MUTEX_INITIALIZER(lockname) \
 		__DEP_MAP_MUTEX_INITIALIZER(lockname) }
 
+#define __WW_CLASS_INITIALIZER(ww_class) \
+		{ .stamp = ATOMIC_LONG_INIT(0) \
+		, .acquire_name = #ww_class "_acquire" \
+		, .mutex_name = #ww_class "_mutex" } /* names derive from the class identifier */
+
+#define __WW_MUTEX_INITIALIZER(lockname, class) \
+		{ .base = __MUTEX_INITIALIZER(lockname.base) /* embedded mutex; already brace-enclosed */ \
+		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
+
 #define DEFINE_MUTEX(mutexname) \
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
+#define DEFINE_WW_CLASS(classname) \
+	struct ww_class classname = __WW_CLASS_INITIALIZER(classname) /* static w/w class definition */
+
+#define DEFINE_WW_MUTEX(mutexname, ww_class) \
+	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class) /* static w/w mutex in ww_class */
+
+
 extern void __mutex_init(struct mutex *lock, const char *name,
 			 struct lock_class_key *key);
 
 /**
+ * ww_mutex_init - initialize the w/w mutex
+ * @lock: the mutex to be initialized
+ * @ww_class: the w/w class the mutex should belong to
+ *
+ * Initialize the w/w mutex to the unlocked state and associate it with the
+ * given class.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+static inline void ww_mutex_init(struct ww_mutex *lock,
+				 struct ww_class *ww_class)
+{
+	__mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
+	lock->ctx = NULL;	/* not held by any acquire context yet */
+#ifdef CONFIG_DEBUG_MUTEXES
+	lock->ww_class = ww_class;	/* compared against the ctx's class on lock */
+#endif
+}
+
+/**
  * mutex_is_locked - is the mutex locked
  * @lock: the mutex to be queried
  *
@@ -133,6 +201,7 @@  static inline int mutex_is_locked(struct mutex *lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
 extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
+
 extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 					unsigned int subclass);
 extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
@@ -144,7 +213,7 @@  extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
-	typecheck(struct lockdep_map *, &(nest_lock)->dep_map);		\
+	typecheck(struct lockdep_map *, &(nest_lock)->dep_map);	\
 	_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);		\
 } while (0)
 
@@ -167,6 +236,192 @@  extern int __must_check mutex_lock_killable(struct mutex *lock);
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
+
+/**
+ * ww_acquire_init - initialize a w/w acquire context
+ * @ctx: w/w acquire context to initialize
+ * @ww_class: w/w class of the context
+ *
+ * Initializes a context to acquire multiple mutexes of the given w/w class.
+ *
+ * Context-based w/w mutex acquiring can be done in any order whatsoever within
+ * a given lock class. Deadlocks will be detected and handled with the
+ * wait/wound logic.
+ *
+ * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
+ * result in undetected deadlocks and is therefore forbidden. Mixing different
+ * contexts for the same w/w class when acquiring mutexes can also result in
+ * undetected deadlocks, and is hence also forbidden.
+ *
+ * Nesting of acquire contexts for _different_ w/w classes is possible, subject
+ * to the usual locking rules between different lock classes.
+ *
+ * An acquire context must be released with ww_acquire_fini by the same task
+ * before its memory is freed. It is recommended to allocate the context itself
+ * on the stack.
+ */
+static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
+				   struct ww_class *ww_class)
+{
+	ctx->task = current;
+	do {
+		ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
+	} while (unlikely(!ctx->stamp));	/* never hand out a zero stamp */
+#ifdef CONFIG_DEBUG_MUTEXES
+	ctx->ww_class = ww_class;
+	ctx->acquired = ctx->done_acquire = 0;
+	ctx->contending_lock = NULL;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
+	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
+			 &ww_class->acquire_key, 0);
+	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);	/* released in ww_acquire_fini */
+#endif
+}
+
+/**
+ * ww_acquire_done - marks the end of the acquire phase
+ * @ctx: the acquire context
+ *
+ * Marks the end of the acquire phase, any further w/w mutex lock calls using
+ * this context are forbidden.
+ *
+ * Calling this function is optional; it is just useful to document w/w mutex
+ * code and to clearly separate the acquire phase from actually using the
+ * locked data structures.
+ */
+static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	lockdep_assert_held(ctx);
+
+	DEBUG_LOCKS_WARN_ON(ctx->done_acquire);	/* warn if called twice */
+	ctx->done_acquire = 1;
+#endif
+}
+
+/**
+ * ww_acquire_fini - releases a w/w acquire context
+ * @ctx: the acquire context to free
+ *
+ * Releases a w/w acquire context. This must be called _after_ all acquired w/w
+ * mutexes have been released with ww_mutex_unlock.
+ */
+static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	mutex_release(&ctx->dep_map, 0, _THIS_IP_);	/* pairs with ww_acquire_init */
+
+	DEBUG_LOCKS_WARN_ON(ctx->acquired);	/* locks still held, or fini twice */
+	if (!config_enabled(CONFIG_PROVE_LOCKING))
+		/*
+		 * lockdep will normally catch locking after fini; without it,
+		 * set done_acquire so ww_mutex_lock_acquired still warns
+		 */
+		ctx->done_acquire = 1;
+
+	if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC))
+		/* ensure ww_acquire_fini will still fail if called twice */
+		ctx->acquired = ~0U;
+#endif
+}
+
+extern int __must_check ww_mutex_lock(struct ww_mutex *lock,
+				      struct ww_acquire_ctx *ctx);
+extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *,
+						    struct ww_acquire_ctx *ctx);
+
+extern void ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx);
+extern int __must_check ww_mutex_lock_slow_interruptible(struct ww_mutex *,
+							 struct ww_acquire_ctx *ctx);
+
+extern void ww_mutex_unlock(struct ww_mutex *lock);
+
+/**
+ * ww_mutex_trylock_single - tries to acquire the w/w mutex without acquire context
+ * @lock: mutex to lock
+ *
+ * Trylocks a mutex without acquire context, so no deadlock detection is
+ * possible. Returns mutex_trylock()'s result: 1 if acquired, 0 on contention.
+ *
+ * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
+ */
+static inline int __must_check ww_mutex_trylock_single(struct ww_mutex *lock)
+{
+	return mutex_trylock(&lock->base);
+}
+
+/**
+ * ww_mutex_lock_single - acquire the w/w mutex without acquire context
+ * @lock: mutex to lock
+ *
+ * Locks a mutex without acquire context, so no deadlock detection is
+ * possible.
+ *
+ * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
+ */
+static inline void ww_mutex_lock_single(struct ww_mutex *lock)
+{
+	mutex_lock(&lock->base);	/* lock->ctx is left untouched */
+}
+
+/**
+ * ww_mutex_lock_single_interruptible - acquire the w/w mutex without acquire
+ * 					context, interruptible
+ * @lock: mutex to lock
+ *
+ * Locks a mutex without acquire context, so no deadlock detection is
+ * possible. If a signal arrives while waiting for the lock then this function
+ * returns -EINTR. Returns 0 if the mutex has been acquired.
+ *
+ * Unlocking the mutex must happen with a call to ww_mutex_unlock_single.
+ */
+static inline int __must_check ww_mutex_lock_single_interruptible(struct ww_mutex *lock)
+{
+	return mutex_lock_interruptible(&lock->base);
+}
+
+/**
+ * ww_mutex_unlock_single - release the w/w mutex without acquire context
+ * @lock: mutex to unlock
+ *
+ * Unlock a w/w mutex that has been locked by this task previously without an
+ * acquire context. If the w/w mutex has been acquired with a context, it must
+ * be released with ww_mutex_unlock.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * a mutex that is not locked is not allowed.
+ */
+static inline void ww_mutex_unlock_single(struct ww_mutex *lock)
+{
+	mutex_unlock(&lock->base);	/* does not clear lock->ctx */
+}
+
+/**
+ * ww_mutex_destroy - mark a w/w mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+static inline void ww_mutex_destroy(struct ww_mutex *lock)
+{
+	mutex_destroy(&lock->base);
+}
+
+/**
+ * ww_mutex_is_locked - is the w/w mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns true if the mutex is locked, false if unlocked.
+ */
+static inline bool ww_mutex_is_locked(struct ww_mutex *lock)
+{
+	return mutex_is_locked(&lock->base);
+}
+
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 84a5f07..66807c7 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -127,16 +127,156 @@  void __sched mutex_unlock(struct mutex *lock)
 
 EXPORT_SYMBOL(mutex_unlock);
 
+/**
+ * ww_mutex_unlock - release the w/w mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously
+ * with ww_mutex_lock* using an acquire context. It is forbidden to release the
+ * locks after releasing the acquire context.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of an unlocked mutex is not allowed.
+ *
+ * Note that locks acquired with one of the ww_mutex_lock*single variants must
+ * be unlocked with ww_mutex_unlock_single.
+ */
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+	/*
+	 * Drop the acquire context first (with debug accounting), then take
+	 * the unlocking fastpath: the 0->1 transition into 'unlocked' state.
+	 */
+#ifdef CONFIG_DEBUG_MUTEXES
+	DEBUG_LOCKS_WARN_ON(!lock->ctx);	/* not locked with a context */
+	if (lock->ctx) {
+		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+		if (lock->ctx->acquired > 0)
+			lock->ctx->acquired--;
+	}
+#endif
+	lock->ctx = NULL;
+	smp_mb__before_atomic_inc();
+
+#ifndef CONFIG_DEBUG_MUTEXES
+	/*
+	 * When debugging is enabled we must not clear the owner before time,
+	 * the slow path will always be taken, and that clears the owner field
+	 * after verifying that it was indeed current.
+	 */
+	mutex_clear_owner(&lock->base);
+#endif
+	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+static inline int __sched
+__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+
+	if (!hold_ctx)
+		return 0;	/* no acquire context recorded on the lock */
+
+	if (unlikely(ctx->stamp == hold_ctx->stamp))
+		return -EALREADY;	/* recorded holder carries our stamp */
+
+	if (unlikely(ctx->stamp - hold_ctx->stamp <= LONG_MAX)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+		ctx->contending_lock = ww;	/* verified by the *_slow lock path */
+#endif
+		return -EDEADLK;	/* our stamp is the later one: back off */
+	}
+
+	return 0;	/* our stamp is the earlier one: keep waiting */
+}
+
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+						   struct ww_acquire_ctx *ww_ctx,
+						   bool ww_slow)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	/*
+	 * If this WARN_ON triggers, the lock was acquired with a context
+	 * but released with mutex_unlock, which left a stale ww->ctx behind.
+	 */
+	DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+	/*
+	 * Locking more mutexes after ww_acquire_done() is forbidden.
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+	if (ww_slow) {
+		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);	/* wrong lock */
+		ww_ctx->contending_lock = NULL;
+	} else
+		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);	/* -EDEADLK pending */
+
+
+	/*
+	 * Naughty, using a different class can lead to undefined behavior!
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+
+	if (ww_slow)
+		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);	/* others still held */
+
+	ww_ctx->acquired++;
+#endif
+}
+
+/*
+ * After acquiring the lock with the fastpath, or when we lost out in the
+ * contested slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+			       struct ww_acquire_ctx *ctx)
+{
+	unsigned long flags;
+	struct mutex_waiter *cur;
+
+	ww_mutex_lock_acquired(lock, ctx, false);
+
+	lock->ctx = ctx;	/* record the holder's acquire context */
+	smp_mb__after_atomic_dec();
+
+	/*
+	 * Check if lock is contended, if not there is nobody to wake up
+	 */
+	if (likely(atomic_read(&lock->base.count) == 0))
+		return;
+
+	/*
+	 * Uh oh, we raced in fastpath, wake up everyone in this case,
+	 * so they can see the new ctx
+	 */
+	spin_lock_mutex(&lock->base.wait_lock, flags);
+	list_for_each_entry(cur, &lock->base.wait_list, list) {
+		debug_mutex_wake_waiter(&lock->base, cur);
+		wake_up_process(cur->task);
+	}
+	spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
 /*
  * Lock a mutex (possibly interruptible), slowpath:
  */
-static inline int __sched
+static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
-		    struct lockdep_map *nest_lock, unsigned long ip)
+		    struct lockdep_map *nest_lock, unsigned long ip,
+		    struct ww_acquire_ctx *ww_ctx, bool ww_slow)
 {
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
 	unsigned long flags;
+	int ret;
 
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -163,6 +303,14 @@  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	for (;;) {
 		struct task_struct *owner;
 
+		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
+			struct ww_mutex *ww;
+
+			ww = container_of(lock, struct ww_mutex, base);
+			if (ACCESS_ONCE(ww->ctx))
+				break;
+		}
+
 		/*
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
@@ -173,6 +321,13 @@  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
 			lock_acquired(&lock->dep_map, ip);
+			if (ww_slow) {
+				struct ww_mutex *ww;
+				ww = container_of(lock, struct ww_mutex, base);
+
+				ww_mutex_set_context_fastpath(ww, ww_ctx);
+			}
+
 			mutex_set_owner(lock);
 			preempt_enable();
 			return 0;
@@ -228,15 +383,16 @@  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * TASK_UNINTERRUPTIBLE case.)
 		 */
 		if (unlikely(signal_pending_state(state, task))) {
-			mutex_remove_waiter(lock, &waiter,
-					    task_thread_info(task));
-			mutex_release(&lock->dep_map, 1, ip);
-			spin_unlock_mutex(&lock->wait_lock, flags);
+			ret = -EINTR;
+			goto err;
+		}
 
-			debug_mutex_free_waiter(&waiter);
-			preempt_enable();
-			return -EINTR;
+		if (!__builtin_constant_p(ww_ctx == NULL) && !ww_slow) {
+			ret = __mutex_lock_check_stamp(lock, ww_ctx);
+			if (ret)
+				goto err;
 		}
+
 		__set_task_state(task, state);
 
 		/* didn't get the lock, go to sleep: */
@@ -251,6 +407,30 @@  done:
 	mutex_remove_waiter(lock, &waiter, current_thread_info());
 	mutex_set_owner(lock);
 
+	if (!__builtin_constant_p(ww_ctx == NULL)) {
+		struct ww_mutex *ww = container_of(lock,
+						      struct ww_mutex,
+						      base);
+		struct mutex_waiter *cur;
+
+		/*
+		 * This branch gets optimized out for the common case,
+		 * and is only important for ww_mutex_lock.
+		 */
+
+		ww_mutex_lock_acquired(ww, ww_ctx, ww_slow);
+		ww->ctx = ww_ctx;
+
+		/*
+		 * Give any possible sleeping processes the chance to wake up,
+		 * so they can recheck if they have to back off.
+		 */
+		list_for_each_entry(cur, &lock->wait_list, list) {
+			debug_mutex_wake_waiter(lock, cur);
+			wake_up_process(cur->task);
+		}
+	}
+
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -261,6 +441,14 @@  done:
 	preempt_enable();
 
 	return 0;
+
+err:
+	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+	spin_unlock_mutex(&lock->wait_lock, flags);
+	debug_mutex_free_waiter(&waiter);
+	mutex_release(&lock->dep_map, 1, ip);
+	preempt_enable();
+	return ret;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -268,7 +456,8 @@  void __sched
 mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    subclass, NULL, _RET_IP_, 0, 0);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -277,7 +466,8 @@  void __sched
 _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
 {
 	might_sleep();
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    0, nest, _RET_IP_, 0, 0);
 }
 
 EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -286,7 +476,8 @@  int __sched
 mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_KILLABLE,
+				   subclass, NULL, _RET_IP_, 0, 0);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
 
@@ -295,10 +486,156 @@  mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
 	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
-				   subclass, NULL, _RET_IP_);
+				   subclass, NULL, _RET_IP_, 0, 0);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+
+/**
+ * ww_mutex_lock - acquire the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately available this function
+ * will either sleep until it is (wait case), or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this
+ * lock and proceed with trying to acquire further w/w mutexes (e.g. when
+ * scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex
+ * still locked. The mutex must first be initialized (or statically defined)
+ * before it can be locked. memset()-ing the mutex to 0 is not allowed. The
+ * mutex must be of the same w/w lock class as was used to initialize the
+ * acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ *
+ * This function is similar to (but not equivalent to) mutex_lock().
+ */
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+				   0, &ctx->dep_map, _RET_IP_, ctx, 0);
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
+
+/**
+ * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately available this function
+ * will either sleep until it is (wait case), or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a
+ * signal arrives while waiting for the lock then this function returns -EINTR.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to
+ * not acquire this lock and proceed with trying to acquire further w/w mutexes
+ * (e.g. when scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex
+ * still locked. The mutex must first be initialized (or statically defined)
+ * before it can be locked. memset()-ing the mutex to 0 is not allowed. The
+ * mutex must be of the same w/w lock class as was used to initialize the
+ * acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ *
+ * This function is similar to (but not equivalent to) mutex_lock_interruptible().
+ */
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+				   0, &ctx->dep_map, _RET_IP_, ctx, 0);
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
+
+/**
+ * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the context held. It is forbidden to call this on anything other than
+ * the contending mutex.
+ */
+void __sched
+ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	__mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+			    0, &ctx->dep_map, _RET_IP_, ctx, 1);
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock_slow);
+
+/**
+ * ww_mutex_lock_slow_interruptible - slowpath acquiring of the w/w mutex,
+ * 				      interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available and returns 0 when the lock has
+ * been acquired. If a signal arrives while waiting for the lock then this
+ * function returns -EINTR.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the given context held. It is forbidden to call this on anything other
+ * than the contending mutex.
+ */
+int __sched
+ww_mutex_lock_slow_interruptible(struct ww_mutex *lock,
+				 struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+				   0, &ctx->dep_map, _RET_IP_, ctx, 1);
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock_slow_interruptible);
+
 #endif
 
 /*
@@ -401,20 +738,39 @@  __mutex_lock_slowpath(atomic_t *lock_count)
 {
 	struct mutex *lock = container_of(lock_count, struct mutex, count);
 
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
+			    NULL, _RET_IP_, 0, 0);
 }
 
 static noinline int __sched
 __mutex_lock_killable_slowpath(struct mutex *lock)
 {
-	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_KILLABLE, 0,
+				   NULL, _RET_IP_, 0, 0);
 }
 
 static noinline int __sched
 __mutex_lock_interruptible_slowpath(struct mutex *lock)
 {
-	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, 0, 0);
 }
+
+static noinline int __sched
+__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, ctx, 0);	/* no nest_lock, ww_slow off */
+}
+
+static noinline int __sched
+__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
+					    struct ww_acquire_ctx *ctx)
+{
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, ctx, 0);	/* no nest_lock, ww_slow off */
+}
+
 #endif
 
 /*
@@ -470,6 +826,63 @@  int __sched mutex_trylock(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_trylock);
 
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+
+	might_sleep();
+
+	ret = __mutex_fastpath_lock_retval(&lock->base.count);	/* 0: fastpath got it */
+
+	if (likely(!ret)) {
+		ww_mutex_set_context_fastpath(lock, ctx);	/* set ctx, wake racers */
+		mutex_set_owner(&lock->base);
+	} else
+		ret = __ww_mutex_lock_slowpath(lock, ctx);	/* contended: slowpath */
+	return ret;
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+
+	might_sleep();
+
+	ret = __mutex_fastpath_lock_retval(&lock->base.count);	/* 0: fastpath got it */
+
+	if (likely(!ret)) {
+		ww_mutex_set_context_fastpath(lock, ctx);	/* set ctx, wake racers */
+		mutex_set_owner(&lock->base);
+	} else
+		ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);	/* contended */
+	return ret;
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched
+ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	__mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+			    0, NULL, _RET_IP_, ctx, 1);	/* ww_slow on; result ignored */
+}
+EXPORT_SYMBOL(ww_mutex_lock_slow);
+
+int __sched
+ww_mutex_lock_slow_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+				   0, NULL, _RET_IP_, ctx, 1);	/* ww_slow on */
+}
+EXPORT_SYMBOL(ww_mutex_lock_slow_interruptible);
+
+#endif
+
 /**
  * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  * @cnt: the atomic which we are to dec
diff --git a/lib/debug_locks.c b/lib/debug_locks.c
index f2fa60c..96c4c63 100644
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -30,6 +30,7 @@  EXPORT_SYMBOL_GPL(debug_locks);
  * a locking bug is detected.
  */
 int debug_locks_silent;
+EXPORT_SYMBOL_GPL(debug_locks_silent);
 
 /*
  * Generic 'turn off all lock debugging' function:
@@ -44,3 +45,4 @@  int debug_locks_off(void)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(debug_locks_off);