Journeying through Optimization with Heuristics
Hi - I’m Jack
I made a programming game
I made a cool hat
That you can play frogger on
I also got really into some maths code
Optimisation Challenges
Quant Finance
The Strategy
What are we optimising?
+ 29.15% oeis [.] mont_mul -> (a*b) (mod p)
+ 28.18% oeis [.] mont_mul_sub -> (a*b) - (c*d) (mod p)
+ 5.13% oeis [.] extended_euclidean -> (1/x) (mod p)
+ 3.62% oeis [.] add_mod_u64 -> (a+b) (mod p)
65% of self time is modular maths primitives
57% is modular multiplication
Modular Multiplication
inline uint64_t mod_mul(uint64_t a, uint64_t b,
uint64_t p, uint64_t p_dash) {
return ((uint128_t)a * (uint128_t)b) % (uint128_t)p;
}
Baseline: 80 seconds with 64 bit division
Montgomery Multiplication
// p_dash = inverse of -1
uint64_t mont_mul(uint64_t a, uint64_t b,
uint64_t p, uint64_t p_dash) {
uint128_t t = (uint128_t)a * b;
uint64_t m = (uint64_t)t * p_dash;
uint128_t u = t + (uint128_t)m * p;
uint64_t res = u >> 64;
if (res >= p) res -= p;
return res;
}
51s run time
https://en.wikipedia.org/wiki/Montgomery_modular_multiplication
Heuristic - Run time of small test case
Calculating the 9th number in the sequence is a nice size test case
Heuristic - Perf record
Heuristic - Division is slow
How to Divide
Euler
VS
Euclid�
Euler's theorem
Extended Euclidean Algorithm
Euclid > Euler!
Euler > Euclid!
Wait, what?
Digging in…
The main difference between these two methods is that one is doing a bunch of multiplication, and the other is doing a bunch of division
Jack’s Laptop (circa 2023) is on a Zen 3 architecture
Heap (circa 2015) is on a broadwell architecture
Multiplies (thank you Agner Fog!)
Zen 3:
Haswell:
Divides (thank you Agner Fog!)
Zen 3:
Haswell:
The solution
Caching
Caching
Heuristic - Keep things small for cache
Lets look at our function!
cmp
Compares the first source operand with the second source operand and sets the status flags in the EFLAGS register according to the results. The comparison is performed by subtracting the second operand from the first operand and then setting the status flags in the same manner as the SUB instruction.
First Optimisation!
Before:
After:
Stop Press - can get it from “C”
One other good idea from the Maths!
uint64_t mont_mul(uint64_t a, uint64_t b,
uint64_t p, uint64_t p_dash) {
uint128_t t = (uint128_t)a * b;
uint64_t m = (uint64_t)t * p_dash;
uint128_t u = t + (uint128_t)m * p;
uint64_t res = u >> 64;
The lower bits of u are always zero
Couldn’t get GCC to do it
Not live demo
Stack Corruption
static inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
asm("movq %rdi, %rax\n"
"movq %rdx, %r8\n"
"mulq %rsi\n"
"imulq %rax, %rcx\n"
"movq %rax, %rsi\n"
"movq %rdx, %rdi\n"
"movq %rcx, %rax\n"
"mulq %r8\n"
"addq %rsi, %rax\n"
"adcq %rdi, %rdx\n"
"movq %rdx, %rax\n"
"subq %r8, %rax\n"
"cmovs %rdx, %rax\n"
"ret\n");
}
make: *** [Makefile:132: test] Error 139
Non obvious syntax error
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %rdi, %rax\n"
"movq %rdx, %r8\n"
"mulq %rsi\n"
"imulq %rax, %rcx\n"
"movq %rax, %rsi\n"
"movq %rdx, %rdi\n"
"movq %rcx, %rax\n"
"mulq %r8\n"
"addq %rsi, %rax\n"
"adcq %rdi, %rdx\n"
"movq %rdx, %rax\n"
"subq %r8, %rax\n"
"cmovs %rdx, %rax"
: "=a"(ret));
return ret;
}
include/maths.h:143:3: error: invalid 'asm': operand number missing after %-letter
Register Corruption
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdi, %%rax\n"
"movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret));
return ret;
}
make: *** [Makefile:132: test] Error 139
Non obvious syntax error
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdi, %%rax\n"
"movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret)
:
: "rax", "rdi", "rdx", "r8", "rsi", "rcx");
return ret;
}
include/maths.h:143:3: error: ‘asm’ operand has impossible constraints
Using Junk data as input
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdi, %%rax\n"
"movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret)
:
: "rdi", "rdx", "r8", "rsi", "rcx");
return ret;
}
❌ n=1 (k=3): expected 2, got 340566958 [0.007s]
Register Corruption
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdi, %%rax\n"
"movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret)
: "D"(a), "S"(b), "d"(p), "c"(p_dash)
: "r8");
return ret;
}
Test hangs forever
Success! But unnecessary instruction
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdi, %%rax\n"
"movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret), "+D"(a), "+S"(b), "+d"(p), "+c"(p_dash)
:
: "r8");
return ret;
}
✅ n=1 OK [0.005s]
Register Corruption
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("movq %%rdx, %%r8\n"
"mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret), "+S"(b), "+d"(p), "+c"(p_dash)
: "a"(a)
: "r8");
return ret;
}
make: *** [Makefile:132: test] Error 139
The feature you want doesn’t exist
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %%r8\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %%r8, %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret), "+S"(b), "+c"(p_dash)
: "a"(a), "r8"(p)
: "r8", "rdi", "rdx");
return ret;
}
include/maths.h:157:7: error: matching constraint references invalid operand number
Success! - actually Undefined Behaviour
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %[p_reg]\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %[p_reg], %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret), "+S"(b), "+c"(p_dash)
: "a"(a), [p_reg] "r"(p)
: "rdi", "rdx");
return ret;
}
✅ n=1 OK [0.005s]
I _think_ this is correct now
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t ret;
asm("mulq %%rsi\n"
"imulq %%rax, %%rcx\n"
"movq %%rax, %%rsi\n"
"movq %%rdx, %%rdi\n"
"movq %%rcx, %%rax\n"
"mulq %[p_reg]\n"
"addq %%rsi, %%rax\n"
"adcq %%rdi, %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %[p_reg], %%rax\n"
"cmovs %%rdx, %%rax\n"
: "=a"(ret), "+S"(b), "+c"(p_dash)
: "a"(a), [p_reg] "r"(p)
: "rdi", "rdx", "cc");
return ret;
}
✅ n=1 OK [0.005s]
Works or not depending on where inlined
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t result;
uint64_t scratch;
asm("mulq %[b]\n"
"movq %%rdx, %[scratch]\n"
"imulq %[p_dash], %%rax\n"
"mulq %[p]\n"
"addq $-1, %%rax\n"
"adcq %[scratch], %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %[p], %%rax\n"
"cmovcq %%rdx, %%rax\n"
: "=a"(result)
: [a] "0"(a), [b] "r"(b), [p] "r"(p), [p_dash] "r"(p_dash),
[scratch] "r"(scratch)
: "rdx", "cc");
return result;
}
✅ n=3 OK [0.006s]
❌ n=3 (--jack both) (k=7): expected 1015440, got 4512050826672256204 [0.009s]
I finally think this is good
inline uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t p, uint64_t p_dash) {
uint64_t result;
uint64_t scratch;
asm("mulq %[b]\n"
"movq %%rdx, %[scratch]\n"
"imulq %[p_dash], %%rax\n"
"mulq %[p]\n"
"addq $-1, %%rax\n"
"adcq %[scratch], %%rdx\n"
"movq %%rdx, %%rax\n"
"subq %[p], %%rax\n"
"cmovcq %%rdx, %%rax\n"
: "=&a"(result), [scratch] "=&r"(scratch)
: [a] "0"(a), [b] "r"(b), [p] "r"(p), [p_dash] "r"(p_dash)
: "rdx", "cc");
return result;
}
✅ n=3 OK [0.006s]
✅ n=3 (--jack both) OK [0.010s]
How does the perf compare?
With C:
2564193830794 cycles:u
With ASM:
2514090657968 cycles:u
About 2% faster
Heuristic - Perf stat - cycles
Ooops - things I found when writing the slides
Actually… clang can generate something just as short
C vs ASM using Clang
(other perf improvements too)
Optimized C:
Inline ASM
Wait… that’s not supposed to happen…
Heuristic - Less instructions is better
Matt Godbolt: Advanced Skylake Deep Dive
https://www.youtube.com/watch?v=BVVNtG5dgks
Heuristic - LLVM MCA
What’s going on here??
Are the LLVM params correct?
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>;
// Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>;
// Integer 32-bit Unsigned Multiply Without Affecting Flags.
Results with Modded Schedule
optimised_c.c
Iterations: 100
Instructions: 800
Total Cycles: 1305
Total uOps: 1000
Dispatch Width: 6
uOps Per Cycle: 0.77
IPC: 0.61
Block RThroughput: 3.0
inline_asm.c
Iterations: 100
Instructions: 900
Total Cycles: 1303
Total uOps: 1100
Dispatch Width: 6
uOps Per Cycle: 0.84
IPC: 0.69
Block RThroughput: 3.0
Results with Modded Schedule
optimised_c.c
[0,0] DeeeeER . . . mulx rax, rdx, rdi
[0,1] D====eeeER. . . imul rdx, rcx
[0,7] .D=============eER . cmovs rax, rdx
Inline_asm.c
[0,0] DeeeER . . . mul rsi
[0,1] D----R . . . mov rdi, rdx
[0,2] D===eeeER . . . imul rax, rcx
[0,8] .D===========eER . cmovb rax, rdx
Code layout noise
Making decisions in uncertainty
Theory 1 the ASM isn’t faster
Theory 2 the ASM is faster
But…
The Heuristics
The Moral
Questions?
Shout outs
Shout out: