Prefetchw stickiness cache block
Author: Guo Ren <guoren@kernel.org>
Agenda
Problem
(LR/SC has no forward-progress guarantee)
Case1: Qspinlock requirement
include/asm-generic/qspinlock.h:
/*
* Queued spinlock
*
* A 'generic' spinlock implementation that is based on MCS locks.
…
* qspinlock relies on a far greater (compared to asm-generic/spinlock.h) set
* of atomic operations to behave well together, please audit them carefully to
* ensure they all have forward progress. Many atomic operations may default to
* cmpxchg() loops which will not have good forward progress properties on
* LL/SC architectures.
…
*/
xchg_tail starvation risk
Core0
(Big)
Core1
(Big)
Core2
(Big)
Core3
(Big)
Core4
(Big)
Core5
(Big)
Core6
(Little)
Core7
(Big)
Core8
(Big)
xchg_tail (contended)
Core0
(Big)
locked
pending
mcs_head
mcs_tail
…
mcs_queued
starving
unlock
Core 6 may be starved
Qspinlock xchg_tail (NR_CPUS < 16K)
/*
 * Publish this CPU as the new MCS queue tail.
 *
 * Exchanges the 16-bit tail field of @lock for @tail and returns the
 * previous tail, shifted back into its position within the lock word.
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 prev;

	prev = (u32)xchg16_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET);
	return prev << _Q_TAIL_OFFSET;
}
No sub-word atomics in the RISC-V base ISA, so the equivalent implementation is based on LR/SC on the containing 32-bit word.
Bitfields of struct qspinlock:
 0 - 15: locked_pending
16 - 31: tail (index[2] & cpu[14])
/*
 * 16-bit exchange emulated with LR/SC on the aligned 32-bit word that
 * contains *ptr (the RISC-V base ISA has no sub-word atomics).
 *
 * @new: value to store (only the low 16 bits are used)
 * @ptr: 2-byte-aligned halfword to exchange
 *
 * Returns the previous 16-bit value at @ptr.
 */
static inline ulong __xchg16_relaxed(ulong new, void *ptr)
{
	ulong ret, tmp;
	ulong shif = ((ulong)ptr & 2) ? 16 : 0;
	/*
	 * Cast before shifting: the int expression (0xffff << 16) is
	 * negative and would sign-extend to 0xffff...ffff0000 in a 64-bit
	 * ulong, letting lr.w's sign extension of the loaded word leak
	 * into the (ret & mask) result below.
	 */
	ulong mask = (ulong)0xffff << shif;
	/* Align down to the enclosing 32-bit word for lr.w/sc.w. */
	ulong *__ptr = (ulong *)((ulong)ptr & ~2);

	__asm__ __volatile__ (
	"0: lr.w %0, %2\n"			/* load-reserve old word */
	" and %1, %0, %z3\n"			/* keep the other halfword */
	" or %1, %1, %z4\n"			/* merge in the new halfword */
	" sc.w %1, %1, %2\n"			/* store-conditional */
	" bnez %1, 0b\n"			/* retry if reservation lost */
	: "=&r" (ret), "=&r" (tmp), "+A" (*__ptr)
	: "rJ" (~mask), "rJ" (new << shif)
	: "memory");

	return (ulong)((ret & mask) >> shif);
}
Qspinlock xchg_tail (NR_CPUS >= 16K)
/*
 * Fallback xchg_tail for NR_CPUS >= 16K: a cmpxchg loop over the whole
 * lock word ("…" marks code elided on the slide).  Because cmpxchg can
 * fail and loop here, this path has no forward-progress guarantee on
 * LL/SC architectures when it contends with other atomics.
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);
	for (;;) {
		/* Preserve locked/pending bits, replace the tail bits. */
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		…
		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
		if (old == val)
			break;
		val = old;
	}
	return old;
}
#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
…
__asm__ __volatile__ ( \
"0: lr.w %0, %2\n" \
" bne %0, %z3, 1f\n" \
" sc.w %1, %z4, %2\n" \
" bnez %1, 0b\n" \
"1:\n" \
When xchg_tail contends with other AMOs, the cmpxchg loop must be able to break out early (the bne to 1f) to guarantee forward progress.
Bitfields of struct qspinlock:
 0 -  8: locked_pending
 9 - 31: tail (index[2] & cpu[21])
Case2: Lockref fastpath
/*
 * Lockref fast path: retry a 64-bit cmpxchg of the combined
 * { spinlock, count } word, up to 100 times, while the spinlock half is
 * observed unlocked.  CODE updates the local "new" copy; SUCCESS runs
 * (and typically returns) when the cmpxchg wins.  On 100 failures or
 * once the lock is seen held, control falls through to the caller's
 * locked slow path.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { \
int retry = 100; \
struct lockref old; \
BUILD_BUG_ON(sizeof(old) != 8); \
old.lock_count = READ_ONCE(lockref->lock_count); \
while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
struct lockref new = old; \
CODE \
if (likely(try_cmpxchg64_relaxed(&lockref->lock_count, \
&old.lock_count, \
new.lock_count))) { \
SUCCESS; \
} \
if (!--retry) \
break; \
} \
} while (0)
https://github.com/jchandra-cavm/refcount-test/
Solution
(Prefetch.w)
RISC-V Base Cache Management Operation ISA Extensions
Microarch Guarantee
Enhance xchg_tail by prefetch.w (NR_CPUS < 16K)
/*
 * NR_CPUS < 16K xchg_tail, unchanged from the earlier slide: the
 * prefetch.w enhancement is applied inside __xchg16_relaxed().
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	return (u32) xchg16_relaxed(&lock->tail,
		tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
static inline ulong __xchg16_relaxed(ulong new, void *ptr)
{
ulong ret, tmp;
ulong shif = ((ulong)ptr & 2) ? 16 : 0;
ulong mask = 0xffff << shif;
ulong *__ptr = (ulong *)((ulong)ptr & ~2);
__asm__ __volatile__ (
+ " prefetch.w %2\n"
"0: lr.w %0, %2\n"
" and %1, %0, %z3\n"
" or %1, %1, %z4\n"
" sc.w %1, %1, %2\n"
" bnez %1, 0b\n"
: "=&r" (ret), "=&r" (tmp), "+A" (*__ptr)
: "rJ" (~mask), "rJ" (new << shif)
: "memory");
return (ulong)((ret & mask) >> shif);
}
Enhance xchg_tail by prefetch.w (NR_CPUS >= 16K)
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
+ prefetchw(&lock->val);
u32 old, new, val = atomic_read(&lock->val);
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
…
old = atomic_cmpxchg_relaxed(&lock->val, val, new);
if (old == val)
break;
val = old;
}
return old;
}
#define __cmpxchg_relaxed(ptr, old, new, size) \
({ \
…
__asm__ __volatile__ ( \
"0: lr.w %0, %2\n" \
" bne %0, %z3, 1f\n" \
" sc.w %1, %z4, %2\n" \
" bnez %1, 0b\n" \
"1:\n" \
Enhance lockref_get by prefetch.w
#define CMPXCHG_LOOP(CODE, SUCCESS) do { \
int retry = 100; \
struct lockref old; \
BUILD_BUG_ON(sizeof(old) != 8); \
+ prefetchw(lockref); \
old.lock_count = READ_ONCE(lockref->lock_count); \
while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
struct lockref new = old; \
CODE \
if (likely(try_cmpxchg64_relaxed(&lockref->lock_count, \
&old.lock_count, \
new.lock_count))) { \
SUCCESS; \
} \
if (!--retry) \
break; \
} \
} while (0)
/*
 * lockref_get - increment the reference count
 * @lockref: combined spinlock + count structure
 *
 * Fast path: CMPXCHG_LOOP bumps the count with a lockless 64-bit
 * cmpxchg and returns on success.  Slow path (lock held, or the retry
 * budget exhausted): take the spinlock and increment under it.
 */
void lockref_get(struct lockref *lockref)
{
	CMPXCHG_LOOP(
		new.count++;
	,
		return;
	);
	spin_lock(&lockref->lock);
	lockref->count++;
	spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);
Benefits of prefetch.w
Discussion
(Cmpxchg Mapping)
prefetch.w a0
lr.w t0, (a0)
Arm64 example: cmpxchg with prefetch (PRFM)
PRFM PSTL1STRM | |
PRFM | Prefetch Memory (register) |
PST | Prefetch for store, encoded in the "Rt<4:3>" field as 0b10. |
L1 | Level 1 cache, encoded in the "Rt<2:1>" field as 0b00. |
STRM | Streaming or non-temporal prefetch, for data that is used only once. Encoded in the "Rt<0>" field as 1. |
Preload instructions
In AArch64 state, the Cortex-A73 processor supports the PRFM (Prefetch Memory) instructions, which signal to the memory system that memory accesses from a specified address are likely to occur in the near future.

Any linefill started by a PLDW instruction causes the data to be invalidated in other cores, so that the line is ready to be written to.
Prefetching only on a cache miss is not enough — prefetch.w wants more :-P
Force LR built-in prefetch.w? (No zawrs, more radical!)
RISCV-BOOM:
https://github.com/riscv-boom/riscv-boom/blob/v3.0.0/src/main/scala/lsu/dcache.scala#L650
OpenXiangShan:
Use “lrscCycles - lrscBackoff” to temporarily block the cache snoop channel, giving a stronger forward-progress guarantee (the line is effectively snoop-inhibited during the LR/SC sequence).
def lrscCycles = 100
def lrscBackoff = 3
def lrscCycles = 80
def lrscBackoff = ?
Is prefetch.w proper for cmpxchg?
A controversial implementation for micro-arch, right?
Conditional atomic with prefetch.w?
/*
 * Atomically add @a to @v unless @v equals @u.
 *
 * Conditional-atomic candidate for prefetch.w: issue prefetchw() on the
 * line before each cmpxchg attempt so the store-side ownership request
 * is already in flight, making the cmpxchg likelier to win first try.
 *
 * Returns the value of @v observed before any addition.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#if defined(arch_atomic_fetch_add_unless)
	return arch_atomic_fetch_add_unless(v, a, u);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c == u))
			break;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c + a));

	return c;
#endif
}
/*
 * Atomically increment @v unless it is negative.
 *
 * prefetchw() before each cmpxchg attempt requests write ownership of
 * the line early, improving the odds that the cmpxchg succeeds.
 *
 * Returns true if the increment happened, false if @v was negative.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#if defined(arch_atomic_inc_unless_negative)
	return arch_atomic_inc_unless_negative(v);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c < 0))
			return false;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c + 1));

	return true;
#endif
}
/*
 * Atomically decrement @v, but only if the result stays non-negative.
 *
 * Returns the decremented value, or a negative value (old - 1) when the
 * decrement was refused.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_if_positive)
	return arch_atomic_dec_if_positive(v);
#else
	int dec, c = raw_atomic_read(v);

	for (;;) {
		dec = c - 1;
		if (unlikely(dec < 0))
			break;
		prefetchw(v);	/* pull the line in a writable state */
		if (raw_atomic_try_cmpxchg(v, &c, dec))
			break;
	}
	return dec;
#endif
}
/*
 * Atomically decrement @v unless it is positive.
 *
 * prefetchw() before each cmpxchg attempt requests write ownership of
 * the line early, improving the odds that the cmpxchg succeeds.
 *
 * Returns true if the decrement happened, false if @v was positive.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_unless_positive)
	return arch_atomic_dec_unless_positive(v);
#else
	int c = raw_atomic_read(v);

	do {
		if (unlikely(c > 0))
			return false;
		prefetchw(v);	/* pull the line in a writable state */
	} while (!raw_atomic_try_cmpxchg(v, &c, c - 1));

	return true;
#endif
}
Thank you