Prefetchw stickiness cache block
Author: Guo Ren <guoren@kernel.org>
Agenda
Problem
(LR/SC has no forward-progress guarantee)
Case1: Qspinlock requirement
include/asm-generic/qspinlock.h:
/*
* Queued spinlock
*
* A 'generic' spinlock implementation that is based on MCS locks.
…
* qspinlock relies on a far greater (compared to asm-generic/spinlock.h) set
* of atomic operations to behave well together, please audit them carefully to
* ensure they all have forward progress. Many atomic operations may default to
* cmpxchg() loops which will not have good forward progress properties on
* LL/SC architectures.
…
*/
xchg_tail starvation risk
Core0
(Big)
Core1
(Big)
Core2
(Big)
Core3
(Big)
Core4
(Big)
Core5
(Big)
Core6
(Little)
Core7
(Big)
Core8
(Big)
xchg_tail (contended)
Core0
(Big)
locked
pending
mcs_head
mcs_tail
…
mcs_queued
starving
unlock
Core 6 may be starved
Qspinlock xchg_tail (NR_CPUS < 16K)
/*
 * Publish this CPU as the new MCS queue tail.
 *
 * Exchanges the 16-bit tail field of @lock for @tail and returns the
 * previous tail, shifted back into its position within the lock word.
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 prev;

	prev = (u32)xchg16_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET);
	return prev << _Q_TAIL_OFFSET;
}
No sub-word atomics in the RISC-V base ISA, so the equivalent implementation is based on LR/SC on the containing 32-bit word.
Bitfields of struct qspinlock:
 0 - 15: locked_pending
16 - 31: tail (index[2] & cpu[14])
/*
 * 16-bit exchange emulated with LR/SC on the aligned 32-bit word that
 * contains *ptr (the RISC-V base ISA has no sub-word atomics).
 *
 * @new: value to store (only the low 16 bits are used)
 * @ptr: 2-byte-aligned halfword to exchange
 *
 * Returns the previous 16-bit value at @ptr.
 */
static inline ulong __xchg16_relaxed(ulong new, void *ptr)
{
	ulong ret, tmp;
	ulong shif = ((ulong)ptr & 2) ? 16 : 0;
	/*
	 * Cast before shifting: the int expression (0xffff << 16) is
	 * negative and would sign-extend to 0xffff...ffff0000 in a 64-bit
	 * ulong, letting lr.w's sign extension of the loaded word leak
	 * into the (ret & mask) result below.
	 */
	ulong mask = (ulong)0xffff << shif;
	/* Align down to the enclosing 32-bit word for lr.w/sc.w. */
	ulong *__ptr = (ulong *)((ulong)ptr & ~2);

	__asm__ __volatile__ (
	"0: lr.w %0, %2\n"			/* load-reserve old word */
	" and %1, %0, %z3\n"			/* keep the other halfword */
	" or %1, %1, %z4\n"			/* merge in the new halfword */
	" sc.w %1, %1, %2\n"			/* store-conditional */
	" bnez %1, 0b\n"			/* retry if reservation lost */
	: "=&r" (ret), "=&r" (tmp), "+A" (*__ptr)
	: "rJ" (~mask), "rJ" (new << shif)
	: "memory");

	return (ulong)((ret & mask) >> shif);
}
Qspinlock xchg_tail (NR_CPUS >= 16K)
/*
 * Fallback xchg_tail for NR_CPUS >= 16K: a cmpxchg loop over the whole
 * lock word ("…" marks code elided on the slide).  Because cmpxchg can
 * fail and loop here, this path has no forward-progress guarantee on
 * LL/SC architectures when it contends with other atomics.
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);
	for (;;) {
		/* Preserve locked/pending bits, replace the tail bits. */
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		…
		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
		if (old == val)
			break;
		val = old;
	}
	return old;
}
#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
…
__asm__ __volatile__ ( \
"0: lr.w %0, %2\n" \
" bne %0, %z3, 1f\n" \
" sc.w %1, %z4, %2\n" \
" bnez %1, 0b\n" \
"1:\n" \
When xchg_tail contends with other AMOs, the cmpxchg loop must be able to break out early (the bne to 1f) to guarantee forward progress.
Bitfields of struct qspinlock:
 0 -  8: locked_pending
 9 - 31: tail (index[2] & cpu[21])
Case2: Lockref fastpath
/*
 * Lockref fast path: retry a 64-bit cmpxchg of the combined
 * { spinlock, count } word, up to 100 times, while the spinlock half is
 * observed unlocked.  CODE updates the local "new" copy; SUCCESS runs
 * (and typically returns) when the cmpxchg wins.  On 100 failures or
 * once the lock is seen held, control falls through to the caller's
 * locked slow path.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { \
int retry = 100; \
struct lockref old; \
BUILD_BUG_ON(sizeof(old) != 8); \
old.lock_count = READ_ONCE(lockref->lock_count); \
while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
struct lockref new = old; \
CODE \
if (likely(try_cmpxchg64_relaxed(&lockref->lock_count, \
&old.lock_count, \
new.lock_count))) { \
SUCCESS; \
} \
if (!--retry) \
break; \
} \
} while (0)
https://github.com/jchandra-cavm/refcount-test/
Solution
(Prefetch.w)
RISC-V Base Cache Management Operation ISA Extensions
Microarch Guarantee
Enhance xchg_tail by prefetch.w (NR_CPUS < 16K)
/*
 * NR_CPUS < 16K xchg_tail, unchanged from the earlier slide: the
 * prefetch.w enhancement is applied inside __xchg16_relaxed().
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	return (u32) xchg16_relaxed(&lock->tail,
		tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
static inline ulong __xchg16_relaxed(ulong new, void *ptr)
{
ulong ret, tmp;
ulong shif = ((ulong)ptr & 2) ? 16 : 0;
ulong mask = 0xffff << shif;
ulong *__ptr = (ulong *)((ulong)ptr & ~2);
__asm__ __volatile__ (
+ " prefetch.w %2\n"
"0: lr.w %0, %2\n"
" and %1, %0, %z3\n"
" or %1, %1, %z4\n"
" sc.w %1, %1, %2\n"
" bnez %1, 0b\n"
: "=&r" (ret), "=&r" (tmp), "+A" (*__ptr)
: "rJ" (~mask), "rJ" (new << shif)
: "memory");
return (ulong)((ret & mask) >> shif);
}
Enhance xchg_tail by prefetch.w (NR_CPUS >= 16K)
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
+ prefetchw(&lock->val);
u32 old, new, val = atomic_read(&lock->val);
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
…
old = atomic_cmpxchg_relaxed(&lock->val, val, new);
if (old == val)
break;
val = old;
}
return old;
}
#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
…
__asm__ __volatile__ ( \
"0: lr.w %0, %2\n" \
" bne %0, %z3, 1f\n" \
" sc.w %1, %z4, %2\n" \
" bnez %1, 0b\n" \
"1:\n" \
Enhance lockref_get by prefetch.w
#define CMPXCHG_LOOP(CODE, SUCCESS) do { \
int retry = 100; \
struct lockref old; \
BUILD_BUG_ON(sizeof(old) != 8); \
+ prefetchw(lockref); \
old.lock_count = READ_ONCE(lockref->lock_count); \
while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
struct lockref new = old; \
CODE \
if (likely(try_cmpxchg64_relaxed(&lockref->lock_count, \
&old.lock_count, \
new.lock_count))) { \
SUCCESS; \
} \
if (!--retry) \
break; \
} \
} while (0)
/*
 * lockref_get - increment the reference count
 * @lockref: combined spinlock + count structure
 *
 * Fast path: CMPXCHG_LOOP bumps the count with a lockless 64-bit
 * cmpxchg and returns on success.  Slow path (lock held, or the retry
 * budget exhausted): take the spinlock and increment under it.
 */
void lockref_get(struct lockref *lockref)
{
	CMPXCHG_LOOP(
		new.count++;
	,
		return;
	);
	spin_lock(&lockref->lock);
	lockref->count++;
	spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);
Benefits of prefetch.w
Discussion
(Cmpxchg Mapping)
prefetch.w a0
lr.w t0, (a0)
Arm64 example: cmpxchg with prefetch (PRFM)
PRFM PSTL1STRM | |
PRFM | Prefetch Memory (register) |
PST | Prefetch for store, encoded in the "Rt<4:3>" field as 0b10. |
L1 | Level 1 cache, encoded in the "Rt<2:1>" field as 0b00. |
STRM | Streaming or non-temporal prefetch, for data that is used only once. Encoded in the "Rt<0>" field as 1. |
Preload instructions
In AArch64 state, the Cortex-A73 processor supports the PRFM (Prefetch Memory) instructions, which signal to the memory system that memory accesses from a specified address are likely to occur in the near future.

Any linefill started by a PLDW instruction causes the data to be invalidated in other cores, so that the line is ready to be written to.
Prefetching only on a cache miss is not enough — prefetch.w wants more :-P
Force LR built-in prefetch.w? (No zawrs, more radical!)
RISCV-BOOM:
https://github.com/riscv-boom/riscv-boom/blob/v3.0.0/src/main/scala/lsu/dcache.scala#L650
OpenXiangShan:
Use “lrscCycles - lrscBackoff” to temporarily block the cache snoop channel, giving a stronger forward-progress guarantee (the line is effectively snoop-inhibited during the LR/SC sequence).
def lrscCycles = 100
def lrscBackoff = 3
def lrscCycles = 80
def lrscBackoff = ?
Is prefetch.w proper for cmpxchg?
A controversial implementation for micro-arch, right?
Conditional atomic with prefetch.w?
/*
 * Atomically add @a to @v unless @v equals @u.
 *
 * Conditional-atomic candidate for prefetch.w: issue prefetchw() on the
 * line before each cmpxchg attempt so the store-side ownership request
 * is already in flight, making the cmpxchg likelier to win first try.
 *
 * Returns the value of @v observed before any addition.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#if defined(arch_atomic_fetch_add_unless)
	return arch_atomic_fetch_add_unless(v, a, u);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c == u))
			break;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c + a));

	return c;
#endif
}
/*
 * Atomically increment @v unless it is negative.
 *
 * prefetchw() before each cmpxchg attempt requests write ownership of
 * the line early, improving the odds that the cmpxchg succeeds.
 *
 * Returns true if the increment happened, false if @v was negative.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#if defined(arch_atomic_inc_unless_negative)
	return arch_atomic_inc_unless_negative(v);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c < 0))
			return false;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c + 1));

	return true;
#endif
}
/*
 * Atomically decrement @v, but only if the result stays non-negative.
 *
 * Returns the decremented value, or a negative value (old - 1) when the
 * decrement was refused.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_if_positive)
	return arch_atomic_dec_if_positive(v);
#else
	int dec, c = raw_atomic_read(v);

	for (;;) {
		dec = c - 1;
		if (unlikely(dec < 0))
			break;
		prefetchw(v);	/* pull the line in a writable state */
		if (raw_atomic_try_cmpxchg(v, &c, dec))
			break;
	}
	return dec;
#endif
}
/*
 * Atomically decrement @v unless it is positive.
 *
 * prefetchw() before each cmpxchg attempt requests write ownership of
 * the line early, improving the odds that the cmpxchg succeeds.
 *
 * Returns true if the decrement happened, false if @v was positive.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_unless_positive)
	return arch_atomic_dec_unless_positive(v);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c > 0))
			return false;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c - 1));

	return true;
#endif
}
Thank you