Subject: Optimise smp_{r,w}mb and mutex
From: Nick Piggin <npiggin@suse.de>
References: 471222 - LTC51356

powerpc: Optimise smp_wmb

Change 2d1b2027626d5151fff8ef7c06ca8e7876a1a510 ("powerpc: Fixup
lwsync at runtime") removed __SUBARCH_HAS_LWSYNC, causing smp_wmb to
revert back to eieio for all CPUs. This restores the behaviour
introduced in 74f0609526afddd88bef40b651da24f3167b10b2 ("powerpc:
Optimise smp_wmb on 64-bit processors").
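
To illustrate the distinction this relies on: wmb() must remain a
full sync, because drivers use it to order cacheable stores against
noncacheable MMIO writes; smp_wmb() only has to order cacheable
stores against each other, so a lighter barrier suffices. A minimal,
hypothetical driver fragment (struct ring_desc and DMA_START are made
up for illustration):

	struct ring_desc {
		u32	addr;
		u32	flags;
	};

	static void start_dma(struct ring_desc *desc, void __iomem *regs,
			      u32 buf)
	{
		desc->addr = buf;	/* store to cacheable memory */
		wmb();			/* full sync: order the store above
					 * before the MMIO write below */
		writel(1, regs + DMA_START);	/* noncacheable device write */
	}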

powerpc: Optimise smp_rmb

After commit 598056d5af8fef1dbe8f96f5c2b641a528184e5a ("[POWERPC] Fix
rmb to order cacheable vs. noncacheable"), rmb() becomes a sync
instruction, which is needed to order cacheable vs. noncacheable loads.
However, smp_rmb() is #defined to rmb(), even though smp_rmb() only has
to order cacheable loads and can therefore be an lwsync.

This restores smp_rmb() performance by using lwsync there and updates
the comments.
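
The smp_ variants only need to order cacheable accesses, which is
exactly what lwsync provides. A minimal message-passing sketch (the
variables are hypothetical) showing the smp_wmb()/smp_rmb() pairing
this makes cheap:

	static int data, flag;

	static void producer(void)
	{
		data = 42;
		smp_wmb();	/* order the store to data before flag */
		flag = 1;
	}

	static int consumer(void)
	{
		if (flag) {
			smp_rmb();	/* order the load of flag before data */
			return data;	/* guaranteed to observe 42 */
		}
		return -1;
	}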

powerpc: Optimise mutex

This implements an optimised mutex fastpath for powerpc, making use of
acquire and release barrier semantics. This takes the mutex
lock+unlock benchmark from 203 to 173 cycles on a G5.
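
The fastpath follows the counting scheme of asm-generic/mutex-dec.h:
count == 1 means unlocked, 0 means locked, and < 0 means locked with
waiters. In simplified form (a sketch of the generic algorithm, not
this patch's asm):

	static inline void
	generic_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
	{
		if (unlikely(atomic_dec_return(count) < 0))
			fail_fn(count);		/* contended: slowpath */
	}

The cycles are saved on barrier placement: atomic_dec_return() pays
for both release and acquire barriers, while lock only needs acquire
semantics (isync after the stwcx.) and unlock only needs release
semantics (lwsync before the store), as the hand-coded helpers below
show.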

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Olaf Hering <olh@suse.de>

---
 arch/powerpc/include/asm/mutex.h  |  135 ++++++++++++++++++++++++++++++++++++--
 arch/powerpc/include/asm/synch.h  |    4 +
 arch/powerpc/include/asm/system.h |   24 +++---
 3 files changed, 147 insertions(+), 16 deletions(-)

--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,134 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+	int t;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	cmpw	0,%0,%2\n\
+	bne-	2f\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%3,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	"\n\
+2:"
+	: "=&r" (t)
+	: "r" (&v->counter), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+	addic	%0,%0,-1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	LWSYNC_ON_SMP
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	addic	%0,%0,1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b"
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ *                         from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		fail_fn(count);
+}
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to 0, and return 1 (success), or if the count
+ * was not 1, then return 0 (failure).
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+		return 1;
+	return 0;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -5,6 +5,10 @@
 #include <linux/stringify.h>
 #include <asm/feature-fixups.h>
 
+#if defined(__powerpc64__) || defined(CONFIG_PPC_E500MC)
+#define __SUBARCH_HAS_LWSYNC
+#endif
+
 #ifndef __ASSEMBLY__
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
 extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -23,15 +23,17 @@
  * read_barrier_depends() prevents data-dependent loads being reordered
  * across this point (nop on PPC).
  *
- * We have to use the sync instructions for mb(), since lwsync doesn't
- * order loads with respect to previous stores. Lwsync is fine for
- * rmb(), though. Note that rmb() actually uses a sync on 32-bit
- * architectures.
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
  *
- * For wmb(), we use sync since wmb is used in drivers to order
- * stores to system memory with respect to writes to the device.
- * However, smp_wmb() can be a lighter-weight lwsync or eieio barrier
- * on SMP since it is only used to order updates to system memory.
+ * For the smp_ barriers, ordering is for cacheable memory operations
+ * only. We have to use the sync instruction for smp_mb(), since lwsync
+ * doesn't order loads with respect to previous stores. Lwsync can be
+ * used for smp_rmb() and smp_wmb().
+ *
+ * However, on CPUs that don't support lwsync, lwsync actually maps to a
+ * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
@@ -45,14 +47,14 @@
 #ifdef CONFIG_SMP
 
 #ifdef __SUBARCH_HAS_LWSYNC
-#    define SMPWMB      lwsync
+#    define SMPWMB      LWSYNC
 #else
 #    define SMPWMB      eieio
 #endif
 
 #define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	__asm__ __volatile__ (__stringify(SMPWMB) : : :"memory")
+#define smp_rmb()	__asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define smp_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
 #define smp_read_barrier_depends()	read_barrier_depends()
 #else
 #define smp_mb()	barrier()