Optimize Animated Model rendering with hardware instancing

All models are using the same global buffer for skinned bones which allows to share shader binding for instancing.
Refactor draw call for batching skinned mesh draws.
Remove `SkinnedMeshDrawData` and merge it into `AnimatedModel` internals.
This commit is contained in:
2026-06-15 17:59:41 +02:00
parent 4d1c627900
commit bc36168318
26 changed files with 430 additions and 367 deletions
+249 -80
View File
@@ -22,71 +22,165 @@
#include "Engine/Level/Scene/Scene.h"
#include "Engine/Level/SceneObjectsFactory.h"
#include "Engine/Profiler/Profiler.h"
#include "Engine/Profiler/ProfilerMemory.h"
#include "Engine/Serialization/Serialization.h"
// Implements efficient skinning data update within a shared GPUMemoryPass with manual resource transitions batched for all animated models.
// Implements efficient skinning data update within a shared GPUBuffer with memory sharing for all animated models.
class AnimatedModelRenderListExtension : public RenderList::IExtension
{
public:
struct Item
// Allocation within the global buffer (offset and size are in bytes)
struct Allocation
{
GPUBuffer* BoneMatrices;
void* Data;
int32 Size;
uint32 Size;
uint32 Offset;
};
RenderListBuffer<Item> Items;
GPUBuffer* GlobalBuffer = nullptr;
RenderListBuffer<Allocation> Updates;
CriticalSection Locker;
Array<Allocation> FreeList;
Array<byte> Data;
uint32 CurrentOffset = 0;
uint32 CurrentSize = 0;
uint32 ReallocateSize = 0; // Lower bound for the new buffer size to copy back from old buffer (in bytes)
volatile int64 UpdateSize = 0;
ReadWriteLock DataLocker; // Ensure to lock data writers when performing reallocation of the global buffer
void PreDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
// Allocates a new skinned bones data block from the global buffer and returns its offset and size (in bytes).
Allocation Allocate(uint32 size)
{
Items.Clear();
Allocation result;
PROFILE_MEM(Animations);
ScopeLock lock(Updates.Locker());
// Check free items list to reuse allocation
auto* freeItems = FreeList.Get();
for (int32 i = 0; i < FreeList.Count(); i++)
{
if (freeItems[i].Size == size)
{
result = freeItems[i];
FreeList.RemoveAt(i);
return result;
}
}
// Check if need to create/resize the global buffer
if (CurrentOffset + size > CurrentSize)
{
DataLocker.WriteLock(); // Ensure none if writing to this buffer during resize
// First allocation sets it (in case multiple reallocs before draw)
if (ReallocateSize == 0)
ReallocateSize = CurrentSize;
// Grow buffer
CurrentSize = CurrentSize == 0 ? 16 * 1024 : CurrentSize * 2;
ASSERT(CurrentOffset + size <= CurrentSize);
Data.Resize(CurrentSize, true);
DataLocker.WriteUnlock();
}
// Allocate new block
result = { size, CurrentOffset };
CurrentOffset += size;
return result;
}
// Frees allocated memory back to the global buffer.
void Free(Allocation alloc)
{
PROFILE_MEM(Animations);
ScopeLock lock(Updates.Locker());
FreeList.Add(alloc);
// TODO: track active allocations count and roll back to offset 0 without free list when all allocations are freed to reduce fragmentation (eg. on scene changing)
}
private:
GPUBuffer* InitBuffer() const
{
GPUBuffer* buffer = GPUDevice::Instance->CreateBuffer(TEXT("BoneMatrices"));
if (buffer->Init(GPUBufferDescription::Typed((int32)(CurrentSize / sizeof(Float4)), PixelFormat::R32G32B32A32_Float, false, GPUResourceUsage::Dynamic)))
{
LOG(Error, "Failed to initialize the skinned mesh bones buffer");
SAFE_DELETE_GPU_RESOURCE(buffer);
}
return buffer;
}
public:
// [RenderList::IExtension]
void Dispose() override
{
// Free memory
Updates.Clear();
FreeList.Clear();
CurrentOffset = 0;
CurrentSize = 0;
ReallocateSize = 0;
SAFE_DELETE_GPU_RESOURCE(GlobalBuffer);
}
void PreDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
{
// Free pending updates to collect the during drawing
Updates.Clear();
UpdateSize = 0;
// Setup global buffer on GPU
if (!CurrentSize)
return;
if (!GlobalBuffer)
{
GlobalBuffer = InitBuffer();
ReallocateSize = 0;
}
else if (ReallocateSize)
{
auto newGlobalBuffer = InitBuffer();
context->CopyBuffer(newGlobalBuffer, GlobalBuffer, ReallocateSize);
GlobalBuffer->DeleteObject(1.0f); // Delay destruction
GlobalBuffer = newGlobalBuffer;
ReallocateSize = 0;
}
}
void PostDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
{
const int32 count = Items.Count();
const int32 count = Updates.Count();
if (count == 0)
return;
PROFILE_GPU_CPU_NAMED("Update Bones");
GPUMemoryPass pass(context);
Item* items = Items.Get();
ScopeWriteLock lock(DataLocker);
// Special case for D3D11 backend that doesn't need transitions
if (context->GetDevice()->GetRendererType() <= RendererType::DirectX11)
auto* updates = Updates.Get();
auto globalBuffer = GlobalBuffer;
auto globalData = Data.Get();
if (context->GetDevice()->GetRendererType() <= RendererType::DirectX11 || // Dynamic buffer cannot be updated partially on D3D11 (hence D3D11_MAP_WRITE_DISCARD), so update the whole buffer
count >= 1000 || // When updates count is large, it is more efficient to update the whole buffer at once
UpdateSize >= (uint32)(0.7f * CurrentOffset)) // When modified size is large, it is more efficient to update the whole buffer at once
{
for (int32 i = 0; i < count; i++)
{
Item& item = items[i];
context->UpdateBuffer(item.BoneMatrices, item.Data, item.Size);
}
// Update whole buffer at once
context->UpdateBuffer(globalBuffer, globalData, CurrentOffset);
}
else
{
// Batch resource barriers for buffer update
for (int32 i = 0; i < count; i++)
pass.Transition(items[i].BoneMatrices, GPUResourceAccess::CopyWrite);
// Update all buffers within Memory Pass (no barriers between)
// Update all modified chunks of the buffer
for (int32 i = 0; i < count; i++)
{
Item& item = items[i];
context->UpdateBuffer(item.BoneMatrices, item.Data, item.Size);
auto& item = updates[i];
context->UpdateBuffer(globalBuffer, globalData + item.Offset, item.Size, item.Offset);
}
// Batch resource barriers for reading in Vertex Shader
for (int32 i = 0; i < count; i++)
pass.Transition(items[i].BoneMatrices, GPUResourceAccess::ShaderReadGraphics);
pass.Transition(globalBuffer, GPUResourceAccess::ShaderReadGraphics);
}
#if COMPILE_WITH_PROFILER
// Insert amount of kilobytes of data updated into profiler trace
uint32 dataSize = 0;
for (int32 i = 0; i < count; i++)
dataSize += items[i].Size;
ZoneValue(dataSize / 1024);
ZoneValue(UpdateSize / 1024); // Trace amount of kilobytes of data updated
#endif
Items.Clear();
Updates.Clear();
UpdateSize = 0;
}
};
@@ -107,6 +201,94 @@ AnimatedModel::AnimatedModel(const SpawnParams& params)
_sphere = BoundingSphere(Vector3::Zero, 0.0f);
}
AnimatedModel::SkinnedBones::SkinnedBones()
{
static_assert(sizeof(*this) == sizeof(uint64), "Update size/alignment.");
*(uint64*)this = 0;
}
AnimatedModel::SkinnedBones::~SkinnedBones()
{
if (IsAllocated)
{
uint32 dataSize = BonesCount * sizeof(Matrix3x4) * (HasPrevBones ? 2 : 1);
RenderListExtension.Free({ dataSize, GlobalBufferOffset });
}
}
void AnimatedModel::SkinnedBones::Update(const SkeletonData& skeleton, const Array<Matrix>& nodesPose, bool perBoneMotionBlur, bool reset)
{
const int32 bonesCount = skeleton.Bones.Count();
// Swap between two halves of the buffer for current/previous frame bones
if (HasPrevBones)
{
IsPrevBones = !IsPrevBones;
}
// Lazy-allocate from global buffer (double the size when using prev frame bones for per-bone motion vectors)
if (!IsAllocated || BonesCount != bonesCount || HasPrevBones != perBoneMotionBlur)
{
if (IsAllocated)
{
uint32 dataSize = BonesCount * sizeof(Matrix3x4) * (HasPrevBones ? 2 : 1);
RenderListExtension.Free({ dataSize, GlobalBufferOffset });
}
uint32 dataSize = bonesCount * sizeof(Matrix3x4) * (perBoneMotionBlur ? 2 : 1);
auto alloc = RenderListExtension.Allocate(dataSize);
GlobalBufferOffset = alloc.Offset;
BonesCount = bonesCount;
IsAllocated = true;
IsPrevBones = false;
IsPrevFlushed = false;
HasPrevBones = perBoneMotionBlur;
}
else if (reset)
{
IsPrevBones = false;
IsPrevFlushed = false;
}
// Copy bones transformations to the CPU buffer (including bone offset matrix) and mark it as dirty to be flushed with GPU buffer later
RenderListExtension.DataLocker.ReadLock();
const SkeletonBone* bones = skeleton.Bones.Get();
const Matrix* nodes = nodesPose.Get();
Matrix3x4* output = (Matrix3x4*)(RenderListExtension.Data.Get() + GlobalBufferOffset); // DataLocker ensures it's safe to access (resizing happens within exclusive write-lock)
if (IsPrevBones)
output += BonesCount; // Write to the second half of the allocation
ASSERT(nodesPose.Count() == skeleton.Nodes.Count());
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
{
const SkeletonBone& bone = bones[boneIndex];
Matrix matrix;
Matrix::Multiply(bone.OffsetMatrix, nodes[bone.NodeIndex], matrix);
output[boneIndex].SetMatrixTranspose(matrix);
}
RenderListExtension.DataLocker.ReadUnlock();
IsDirty = true;
}
void AnimatedModel::SkinnedBones::Flush()
{
uint32 size = BonesCount * sizeof(Matrix3x4);
uint32 offset = GlobalBufferOffset;
if (IsPrevBones)
{
// Write to the second half of the allocation (1st half will contain previous frame bones)
offset += size;
// Mark initial flush of the previous frame bones
IsPrevFlushed = true;
}
// Add pending buffer update
RenderListExtension.Updates.Add({ size, offset });
Platform::InterlockedAdd(&RenderListExtension.UpdateSize, size);
// Clear dirty flag
IsDirty = false;
}
AnimatedModel::~AnimatedModel()
{
if (_deformation)
@@ -139,15 +321,6 @@ void AnimatedModel::UpdateAnimation()
void AnimatedModel::SetupSkinningData()
{
ASSERT(SkinnedModel && SkinnedModel->IsLoaded());
const int32 targetBonesCount = SkinnedModel->Skeleton.Bones.Count();
const int32 currentBonesCount = _skinningData.BonesCount;
if (targetBonesCount != currentBonesCount)
{
_skinningData.Setup(targetBonesCount);
}
}
void AnimatedModel::PreInitSkinningData()
@@ -158,7 +331,6 @@ void AnimatedModel::PreInitSkinningData()
PROFILE_MEM(Animations);
ScopeLock lock(SkinnedModel->Locker);
SetupSkinningData();
auto& skeleton = SkinnedModel->Skeleton;
const int32 bonesCount = skeleton.Bones.Count();
const int32 nodesCount = skeleton.Nodes.Count();
@@ -180,15 +352,7 @@ void AnimatedModel::PreInitSkinningData()
GraphInstance.RootTransform = nodesCount > 0 ? skeleton.Nodes[0].LocalTransform : Transform::Identity;
// Setup bones transformations including bone offset matrix
Matrix3x4* output = (Matrix3x4*)_skinningData.Data.Get();
const SkeletonBone* bones = skeleton.Bones.Get();
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
{
auto& bone = bones[boneIndex];
Matrix identityMatrix = bone.OffsetMatrix * nodesPose[bone.NodeIndex];
output[boneIndex].SetMatrixTranspose(identityMatrix);
}
_skinningData.OnDataChanged(true);
_bones.Update(skeleton, GraphInstance.NodesPose, PerBoneMotionBlur, true);
UpdateBounds();
UpdateSockets();
@@ -926,18 +1090,7 @@ void AnimatedModel::OnAnimationUpdated_Async()
// Calculate the final bones transformations and update skinning
{
ANIM_GRAPH_PROFILE_EVENT("Final Pose");
const int32 bonesCount = skeleton.Bones.Count();
Matrix3x4* output = (Matrix3x4*)_skinningData.Data.Get();
ASSERT(GraphInstance.NodesPose.Count() == skeleton.Nodes.Count());
ASSERT(_skinningData.Data.Count() == bonesCount * sizeof(Matrix3x4));
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
{
const SkeletonBone& bone = skeleton.Bones[boneIndex];
Matrix matrix;
Matrix::Multiply(bone.OffsetMatrix, GraphInstance.NodesPose.Get()[bone.NodeIndex], matrix);
output[boneIndex].SetMatrixTranspose(matrix);
}
_skinningData.OnDataChanged(!PerBoneMotionBlur);
_bones.Update(skeleton, GraphInstance.NodesPose, PerBoneMotionBlur);
}
//if (UpdateWhenOffscreen)
@@ -1077,18 +1230,26 @@ void AnimatedModel::Draw(RenderContext& renderContext)
GEOMETRY_DRAW_STATE_EVENT_BEGIN(_drawState, world);
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(_transform.Translation, renderContext.View.WorldPosition));
if (_skinningData.IsReady())
if (_bones.IsAllocated)
{
// Flush skinning data with GPU
if (_skinningData.IsDirty())
{
RenderListExtension.Items.Add({ _skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count() });
_skinningData.OnFlush();
}
if (_bones.IsDirty)
_bones.Flush();
SkinnedMesh::DrawInfo draw;
draw.Buffer = &Entries;
draw.Skinning = &_skinningData;
draw.SkinningBones = RenderListExtension.GlobalBuffer;
draw.SkinningBonesOffset = _bones.GlobalBufferOffset / sizeof(Matrix3x4);
draw.WithPrevBones = _bones.HasPrevBones && _bones.IsPrevFlushed;
if (draw.WithPrevBones)
{
draw.PrevBonesOffset = _bones.BonesCount;
if (_bones.IsPrevBones)
{
draw.SkinningBonesOffset += draw.PrevBonesOffset;
draw.PrevBonesOffset = -draw.PrevBonesOffset;
}
}
draw.World = &world;
draw.DrawState = &_drawState;
draw.Deformation = _deformation;
@@ -1120,18 +1281,26 @@ void AnimatedModel::Draw(RenderContextBatch& renderContextBatch)
GEOMETRY_DRAW_STATE_EVENT_BEGIN(_drawState, world);
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(_transform.Translation, renderContext.View.WorldPosition));
if (_skinningData.IsReady())
if (_bones.IsAllocated)
{
// Flush skinning data with GPU
if (_skinningData.IsDirty())
{
RenderListExtension.Items.Add({ _skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count() });
_skinningData.OnFlush();
}
if (_bones.IsDirty)
_bones.Flush();
SkinnedMesh::DrawInfo draw;
draw.Buffer = &Entries;
draw.Skinning = &_skinningData;
draw.SkinningBones = RenderListExtension.GlobalBuffer;
draw.SkinningBonesOffset = _bones.GlobalBufferOffset / sizeof(Matrix3x4);
draw.WithPrevBones = _bones.HasPrevBones && _bones.IsPrevFlushed;
if (draw.WithPrevBones)
{
draw.PrevBonesOffset = _bones.BonesCount;
if (_bones.IsPrevBones)
{
draw.SkinningBonesOffset += draw.PrevBonesOffset;
draw.PrevBonesOffset = -draw.PrevBonesOffset;
}
}
draw.World = &world;
draw.DrawState = &_drawState;
draw.Deformation = _deformation;