Performance Tuning for CPU
Part 2: Advanced SIMD Optimization
Marat Dukhan
SSE Optimization
Last lecture:
This lecture:
SSE data types
__m128i can hold
__m128 holds four single-precision floats
__m128d holds two double-precision floats
SSE memory operations
uint64_t *aPtr = ...
// Load 2 unsigned 64-bit ints from memory
__m128i a = _mm_loadu_si128(aPtr);
// Store 2 unsigned 64-bit ints to memory
_mm_storeu_si128(aPtr, a);
int8_t *bPtr = ...
// Load 16 signed 8-bit ints from memory
__m128i b = _mm_loadu_si128(bPtr);
// Store 16 signed 8-bit ints to memory
_mm_storeu_si128(bPtr, b);
SSE arithmetic operations
Do operation <op> of type <type> on a and b, write result to c:
__m128i c = _mm_<op>_<type>(a, b);
<type> suffix specifies how to interpret data in __m128i:
__m128i c = _mm_min_epi8(a, b);
16 pairwise minimums of 8-bit signed ints
__m128i c = _mm_min_epu16(a, b);
8 pairwise minimums of 16-bit unsigned ints
SSE arithmetic operations
The difference between signed and unsigned:
16-bit integer 0xFFFF = 65535 (unsigned) = -1 (signed)
Non-trivial cases when signed/unsigned matters
__m128i c = _mm_srai_epi16(a, b);
8 pairwise shifts right of 16-bit signed ints by const b
__m128i c = _mm_srli_epi16(a, b);
8 pairwise shifts right of 16-bit unsigned ints by const b
__m128i c = _mm_mulhi_epi16(a, b);
8 pairwise multiplications (high 16 bits of each 32-bit product) of 16-bit signed ints
SSE arithmetic operations
When signed/unsigned interpretation doesn't matter for the operation epiX suffix is used:
__m128i c = _mm_add_epi16(a, b);
8 pairwise sums of 16-bit ints
__m128i c = _mm_sub_epi32(a, b);
4 pairwise differences of 32-bit ints
__m128i c = _mm_slli_epi16(a, b);
8 pairwise shifts left of 16-bit ints in a by the constant b
__m128i c = _mm_mullo_epi32(a, b);
4 pairwise multiplications of 32-bit ints
SSE logical operations
For logical operations even the size of the type does not matter, and they all use si128 suffix:
__m128i c = _mm_and_si128(a, b);
Bitwise AND of 128-bit vectors (c = a & b)
__m128i c = _mm_andnot_si128(a, b);
Bitwise NOT + AND of 128-bit vectors (c = ~a & b)
__m128i c = _mm_or_si128(a, b);
Bitwise OR of 128-bit vectors (c = a | b)
__m128i c = _mm_xor_si128(a, b);
Bitwise XOR of 128-bit vectors (c = a ^ b)
SSE illogical operations
What is illogical about these operations?
SSE illogical operations
What is illogical about these operations?
It took three generations of SSE to add the 32-bit version
Example 1: Array maximum
// Scalar reference implementation: returns the maximum of an array of
// unsigned 32-bit integers. An empty array yields 0.
uint32_t vector_max(
    const uint32_t *arrayPointer,
    size_t length)
{
    uint32_t result = 0;
    // One-past-the-end sentinel; walk the array once with a pointer range.
    const uint32_t *const arrayEnd = arrayPointer + length;
    while (arrayPointer != arrayEnd) {
        // Keep the running maximum of all elements seen so far.
        if (*arrayPointer > result) {
            result = *arrayPointer;
        }
        ++arrayPointer;
    }
    return result;
}
uint32_t vector_max(const uint32_t *arrayPointer, size_t length) {
// Initialize the maximum SSE vector
__m128i max_vec = _mm_setzero_si128();
for (; length >= 2; length -= 2) {
// Load two array elements
const __m128i elements =
_mm_loadu_si128((const __m128i*)arrayPointer);
// Update maximum
max_vec = _mm_max_epu32(max_vec, elements);
// Advance pointers to the next element
arrayPointer += 4;
}
uint32_t max = std::max(
std::max(
uint32_t(_mm_cvt_si128_si32(max_vec)),
uint32_t(_mm_extract_epi32(max_vec, 1)),
),
std::max(
uint32_t(_mm_extract_epi32(max_vec, 2)),
uint32_t(_mm_extract_epi32(max_vec, 3))
)
);
// Process the remaining elements of the array (if any)
...
return max;
}
Add SSE vectorization
SSE unpack operations
c = _mm_unpacklo_epi64(a, b);
c = _mm_unpackhi_epi64(a, b);
SSE unpack operations
c = _mm_unpacklo_epi32(a, b);
c = _mm_unpackhi_epi32(a, b);
SSE unpack operations
c = _mm_unpacklo_epi16(a, b);
c = _mm_unpackhi_epi16(a, b);
SSE unpack operations
c = _mm_unpacklo_epi8(a, b);
c = _mm_unpackhi_epi8(a, b);
Example 2: Image Thresholding
Steps:
Deinterleaving RGB pixels with SSE
The scariest picture in this course*
*probably
Deinterleaving RGB pixels with SSE
A simplified example
Deinterleaving RGB pixels with SSE
__m128i layer0_chunk0 = _mm_loadu_si128((__m128i*)source_pixels);
__m128i layer0_chunk1 = _mm_loadu_si128((__m128i*)(source_pixels + 16));
__m128i layer0_chunk2 = _mm_loadu_si128((__m128i*)(source_pixels + 32));
__m128i layer0_chunk3 = _mm_loadu_si128((__m128i*)(source_pixels + 48));
__m128i layer0_chunk4 = _mm_loadu_si128((__m128i*)(source_pixels + 64));
__m128i layer0_chunk5 = _mm_loadu_si128((__m128i*)(source_pixels + 80));
__m128i layer1_chunk0 = _mm_unpacklo_epi8(layer0_chunk0, layer0_chunk3);
__m128i layer1_chunk1 = _mm_unpackhi_epi8(layer0_chunk0, layer0_chunk3);
__m128i layer1_chunk2 = _mm_unpacklo_epi8(layer0_chunk1, layer0_chunk4);
__m128i layer1_chunk3 = _mm_unpackhi_epi8(layer0_chunk1, layer0_chunk4);
__m128i layer1_chunk4 = _mm_unpacklo_epi8(layer0_chunk2, layer0_chunk5);
__m128i layer1_chunk5 = _mm_unpackhi_epi8(layer0_chunk2, layer0_chunk5);
...
__m128i red_chunk0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
__m128i red_chunk1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
__m128i green_chunk0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
__m128i green_chunk1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
__m128i blue_chunk0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
__m128i blue_chunk1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
source_pixels += 96;
SSE shuffle instructions
c = _mm_shuffle_epi32(a, n);
SSE shuffle instructions
a = _mm_shuffle_epi8(b, c);
Deinterleaving RGB pixels with SSE
Deinterleaving RGB pixels with SSE
__m128i ssse3_red_indeces_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
__m128i ssse3_red_indeces_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
__m128i ssse3_red_indeces_2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i ssse3_green_indeces_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
__m128i ssse3_green_indeces_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
__m128i ssse3_green_indeces_2 = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i ssse3_blue_indeces_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
__m128i ssse3_blue_indeces_1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
__m128i ssse3_blue_indeces_2 = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m128i chunk0 = _mm_loadu_si128((const __m128i*)(source_pixel));
const __m128i chunk1 = _mm_loadu_si128((const __m128i*)(source_pixel + 16));
const __m128i chunk2 = _mm_loadu_si128((const __m128i*)(source_pixel + 32));
source_pixel += 48;
const __m128i red = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, ssse3_red_indeces_0),
_mm_shuffle_epi8(chunk1, ssse3_red_indeces_1)), _mm_shuffle_epi8(chunk2, ssse3_red_indeces_2));
...