Mobile Graphics, Vulkan
Arman Papikyan
Lead Render Mobile Programmer at Ubisoft Ukraine
What is Graphics API (history)
What is Graphics API (history)
What is Graphics API (history)
What is Graphics API (history)
What is Graphics API (history)
- 1992
What is Graphics API (history)
- 1992
- 2003
What is Graphics API (history)
- 2016
- 1992
- 2003
9
10
11
12
13
14
15
16
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
vkCmdEndRenderPass(commandBuffer);
VkResult result = vkEndCommandBuffer(commandBuffer);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
Set up the frame
> State Management
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
> State Management
Use shader
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
> State Management
Use Texture
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
> State Management
Use vertex & index buffers
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
Draw call
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
vkCmdEndRenderPass(commandBuffer);
VkResult result = vkEndCommandBuffer(commandBuffer);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
Finish
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
vkCmdEndRenderPass(commandBuffer);
VkResult result = vkEndCommandBuffer(commandBuffer);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
vkCmdEndRenderPass(commandBuffer);
VkResult result = vkEndCommandBuffer(commandBuffer);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
vkBeginCommandBuffer(commandBuffer, &cmdBufInfo);
vkCmdBeginRenderPass(commandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdBindVertexBuffers(commandBuffer, 0, 1, &_vertexBuffer, _offsets);
vkCmdBindIndexBuffer(commandBuffer, _indexBuffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(commandBuffer, _index_count, 1, 0, 0, 1);
vkCmdEndRenderPass(commandBuffer);
VkResult result = vkEndCommandBuffer(commandBuffer);
glClearColor(0.f, 0.f, 0.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glUseProgram(shaderProgram);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, tex);
glUniform1i(glGetUniformLocation(shaderProgram, "tex"), 0);
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, _index_count, GL_UNSIGNED_INT, 0);
> State Management
> Multithreading
> Multithreading
© Mobile Graphics Processing Fundamentals – 2021, ARM webinar
GPU
CPU
CPU
CPU
Shared Cache
System Cache
DRAM
Example S20 Snapdragon
1 x 2.84 GHz 100%
3 x 2.42 GHz 85%
4 x 1.80 GHz 63%
Small Core – Low Frequency
Efficiency
Big Core – High Frequency
Performance
> Multithreading
Power consumption relationship with performance is not linear
Transistor energy per operation ~ Voltage squared
© Mobile Graphics Processing Fundamentals – 2021, ARM webinar
Why is multithreading important?
30
> Multithreading
Nvidia GeForce GTX 1080 Ti
Heap 10.87 GB
DEVICE_LOCAL
AMD Radeon RX Vega 64
Heap 7.75 GB
DEVICE_LOCAL
Heap 256 MB
DEVICE_LOCAL | HOST_VISIBLE
Adreno / Mali / PowerVR..
HEAP – 0 MB
*no VRAM
31
© Memory Management in Vulkan – 2018, Jordan Logan, AMD
> Memory Management
Why do you need explicit memory management?
32
> Memory Management
33
Rasterization modes:
1. Immediate mode
Traditional GPU
2. Tiled mode
Mobile GPU
34
IMMEDIATE MODE
Render each triangle individually
for draw in renderPass:
    for primitive in draw:
        for vertex in primitive:
            execute_vertex_shader(vertex)
        if primitive not culled:
            for fragment in primitive:
                execute_fragment_shader(fragment)
© developer.arm.com/documentation/102662/0100/Immediate-Mode-GPUs
35
IMMEDIATE MODE
Render each triangle individually
for draw in renderPass:
    for primitive in draw:
        for vertex in primitive:
            execute_vertex_shader(vertex)
        if primitive not culled:
            for fragment in primitive:
                execute_fragment_shader(fragment)
© developer.arm.com/documentation/102662/0100/Immediate-Mode-GPUs
36
# Pass one
for draw in renderPass:
    for primitive in draw:
        for vertex in primitive:
            execute_vertex_shader(vertex)
        if primitive not culled:
            append_tile_list(primitive)
1. Break into bins
TILED MODE
© developer.arm.com/documentation/102662/0100/Tile-based-GPUs
37
# Pass one
for draw in renderPass:
    for primitive in draw:
        for vertex in primitive:
            execute_vertex_shader(vertex)
        if primitive not culled:
            append_tile_list(primitive)
1. Break into bins
TILED MODE
© developer.arm.com/documentation/102662/0100/Tile-based-GPUs
38
# Pass two
for tile in renderPass:
    for primitive in tile:
        for fragment in primitive:
            execute_fragment_shader(fragment)
2. Render each tile
TILED MODE
© developer.arm.com/documentation/102662/0100/Tile-based-GPUs
39
# Pass one
for draw in renderPass:
    for primitive in draw:
        for vertex in primitive:
            execute_vertex_shader(vertex)
        if primitive not culled:
            append_tile_list(primitive)
# Pass two
for tile in renderPass:
    for primitive in tile:
        for fragment in primitive:
            execute_fragment_shader(fragment)
1. Break into bins
2. Render each tile
TILED MODE
© developer.arm.com/documentation/102662/0100/Tile-based-GPUs
Memory Bandwidth
40
41
RESOURCE
Texture / Buffer / etc
R
W
R
W
Random Frame
Render scene => Color RT
Color RT => Anti-Alias RT
Anti-Alias RT => Display
RESOURCE
Texture / Buffer / etc
R
W
R
W
Lock
Lock
Random Frame
Render scene => Color RT
Lock
Color RT => Anti-Alias RT
Lock
Anti-Alias RT => Display
> Synchronization
42
> Synchronization
CPU
GPU
43
> Synchronization
CPU - Thread 1
CPU - Thread 2
GPU
44
> Synchronization
Fence: GPU - to - CPU
Semaphore: CPU - to - GPU
Barrier: GPU - to - GPU
Timeline Semaphore: All - to - All
45
> Synchronization
// Three dispatches that don’t have conflicting resource accesses
vkCmdDispatch( 1 );
vkCmdDispatch( 2 );
vkCmdDispatch( 3 );

// 4, 5, and 6 don’t share resources with 1, 2, and 3
// No reason for them to be blocked, so set an event to wait for later
vkCmdSetEvent( A, srcStageMask = COMPUTE );
vkCmdDispatch( 4 );
vkCmdDispatch( 5 );
vkCmdDispatch( 6 );

// 7 and 8 don’t use the same resources as 4, 5, and 6. So use an event
vkCmdSetEvent( B, srcStageMask = COMPUTE );

// 7 and 8 need the results of 1, 2, and 3
// So we’ll wait for them by waiting on A
vkCmdWaitEvents( A, dstStageMask = COMPUTE );
vkCmdDispatch( 7 );
vkCmdDispatch( 8 );

// 9 uses the same resources as 4, 5, and 6 so we wait.
// Also assumed is that 9 needs nothing from 7 and 8
vkCmdWaitEvents( B, dstStageMask = COMPUTE );
vkCmdDispatch( 9 );
© khronos.org/blog/understanding-vulkan-synchronization
46
© github.com/KhronosGroup/Vulkan-ValidationLayers/blob/master/docs/synchronization_usage.md
> Synchronization
47
© khronos.org/blog/understanding-vulkan-synchronization
> Synchronization
VK_IMAGE_LAYOUT_UNDEFINED
VK_IMAGE_LAYOUT_GENERAL
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
VK_IMAGE_LAYOUT_PREINITIALIZED
VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
VK_ACCESS_INDIRECT_COMMAND_READ_BIT
VK_ACCESS_INDEX_READ_BIT
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT
VK_ACCESS_UNIFORM_READ_BIT
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT
VK_ACCESS_SHADER_READ_BIT
VK_ACCESS_SHADER_WRITE_BIT
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT
VK_ACCESS_TRANSFER_READ_BIT
VK_ACCESS_TRANSFER_WRITE_BIT
VK_ACCESS_HOST_READ_BIT
VK_ACCESS_HOST_WRITE_BIT
VK_ACCESS_MEMORY_READ_BIT
VK_ACCESS_MEMORY_WRITE_BIT
Stage
Layout
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
48
> Synchronization
49
> Synchronization
// Three dispatches that don’t have conflicting resource accesses
vkCmdDispatch( 1 );
vkCmdDispatch( 2 );
vkCmdDispatch( 3 );

// 4, 5, and 6 don’t share resources with 1, 2, and 3
// No reason for them to be blocked, so set an event to wait for later
vkCmdSetEvent( A, srcStageMask = COMPUTE );
vkCmdDispatch( 4 );
vkCmdDispatch( 5 );
vkCmdDispatch( 6 );

// 7 and 8 don’t use the same resources as 4, 5, and 6. So use an event
vkCmdSetEvent( B, srcStageMask = COMPUTE );

// 7 and 8 need the results of 1, 2, and 3
// So we’ll wait for them by waiting on A
vkCmdWaitEvents( A, dstStageMask = COMPUTE );
vkCmdDispatch( 7 );
vkCmdDispatch( 8 );

// 9 uses the same resources as 4, 5, and 6 so we wait.
// Also assumed is that 9 needs nothing from 7 and 8
vkCmdWaitEvents( B, dstStageMask = COMPUTE );
vkCmdDispatch( 9 );
© themaister.net/blog/2017/08/15/render-graphs-and-vulkan-a-deep-dive/
1
2
3
7
8
4
5
6
9
50
> Synchronization
© themaister.net/blog/2017/08/15/render-graphs-and-vulkan-a-deep-dive/
1
2
3
7
8
4
5
6
9
51
> Synchronization
- VK_KHR_acceleration_structure
52
> Hardware Raytracing
Summary
© ravbug.com/graphics/
GLES 2.0 2012+
GLES 3.1 2016+
Metal 2014+
Vulkan 2018+
Vulkan -> [MoltenVK] -> Metal
GLES 2.0 2012+ (96.5%)*
GLES 3.1 2016+ (90%)*
Metal 2014+ (up to 50%)*
Vulkan 2018+ (up to 30%)*
*Estimates
GLES 2.0 2012+ (96.5%)*
GLES 3.1 2016+ (90%)*
Metal 2014+ (up to 50%)*
Vulkan 2018+ (up to 30%)*
*Estimates
Vulkan supports all 2018+ mobile devices
Limited
References
59
Thanks for your attention,
Questions?
60
Geometry
Pass
Draw
Material
Pass
Lighting
Pass
Geometry
Pass
Draw
Material + Lighting
Pass
61
© learnopengl.com/Advanced-Lighting/Deferred-Shading
62
© learnopengl.com/Advanced-Lighting/Deferred-Shading
O(num_geometry_fragments * num_lights)
O(screen_resolution * num_lights)