Optimize Animated Model rendering with hardware instancing
All models are using the same global buffer for skinned bones which allows to share shader binding for instancing. Refactor draw call for batching skinned mesh draws. Remove `SkinnedMeshDrawData` and merge it into `AnimatedModel` internals.
This commit is contained in:
@@ -22,71 +22,165 @@
|
||||
#include "Engine/Level/Scene/Scene.h"
|
||||
#include "Engine/Level/SceneObjectsFactory.h"
|
||||
#include "Engine/Profiler/Profiler.h"
|
||||
#include "Engine/Profiler/ProfilerMemory.h"
|
||||
#include "Engine/Serialization/Serialization.h"
|
||||
|
||||
// Implements efficient skinning data update within a shared GPUMemoryPass with manual resource transitions batched for all animated models.
|
||||
// Implements efficient skinning data update within a shared GPUBuffer with memory sharing for all animated models.
|
||||
class AnimatedModelRenderListExtension : public RenderList::IExtension
|
||||
{
|
||||
public:
|
||||
struct Item
|
||||
// Allocation within the global buffer (offset and size are in bytes)
|
||||
struct Allocation
|
||||
{
|
||||
GPUBuffer* BoneMatrices;
|
||||
void* Data;
|
||||
int32 Size;
|
||||
uint32 Size;
|
||||
uint32 Offset;
|
||||
};
|
||||
|
||||
RenderListBuffer<Item> Items;
|
||||
GPUBuffer* GlobalBuffer = nullptr;
|
||||
RenderListBuffer<Allocation> Updates;
|
||||
CriticalSection Locker;
|
||||
Array<Allocation> FreeList;
|
||||
Array<byte> Data;
|
||||
uint32 CurrentOffset = 0;
|
||||
uint32 CurrentSize = 0;
|
||||
uint32 ReallocateSize = 0; // Lower bound for the new buffer size to copy back from old buffer (in bytes)
|
||||
volatile int64 UpdateSize = 0;
|
||||
ReadWriteLock DataLocker; // Ensure to lock data writers when performing reallocation of the global buffer
|
||||
|
||||
void PreDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
|
||||
// Allocates a new skinned bones data block from the global buffer and returns its offset and size (in bytes).
|
||||
Allocation Allocate(uint32 size)
|
||||
{
|
||||
Items.Clear();
|
||||
Allocation result;
|
||||
PROFILE_MEM(Animations);
|
||||
ScopeLock lock(Updates.Locker());
|
||||
|
||||
// Check free items list to reuse allocation
|
||||
auto* freeItems = FreeList.Get();
|
||||
for (int32 i = 0; i < FreeList.Count(); i++)
|
||||
{
|
||||
if (freeItems[i].Size == size)
|
||||
{
|
||||
result = freeItems[i];
|
||||
FreeList.RemoveAt(i);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if need to create/resize the global buffer
|
||||
if (CurrentOffset + size > CurrentSize)
|
||||
{
|
||||
DataLocker.WriteLock(); // Ensure none if writing to this buffer during resize
|
||||
|
||||
// First allocation sets it (in case multiple reallocs before draw)
|
||||
if (ReallocateSize == 0)
|
||||
ReallocateSize = CurrentSize;
|
||||
|
||||
// Grow buffer
|
||||
CurrentSize = CurrentSize == 0 ? 16 * 1024 : CurrentSize * 2;
|
||||
ASSERT(CurrentOffset + size <= CurrentSize);
|
||||
Data.Resize(CurrentSize, true);
|
||||
|
||||
DataLocker.WriteUnlock();
|
||||
}
|
||||
|
||||
// Allocate new block
|
||||
result = { size, CurrentOffset };
|
||||
CurrentOffset += size;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Frees allocated memory back to the global buffer.
|
||||
void Free(Allocation alloc)
|
||||
{
|
||||
PROFILE_MEM(Animations);
|
||||
ScopeLock lock(Updates.Locker());
|
||||
FreeList.Add(alloc);
|
||||
|
||||
// TODO: track active allocations count and roll back to offset 0 without free list when all allocations are freed to reduce fragmentation (eg. on scene changing)
|
||||
}
|
||||
|
||||
private:
|
||||
GPUBuffer* InitBuffer() const
|
||||
{
|
||||
GPUBuffer* buffer = GPUDevice::Instance->CreateBuffer(TEXT("BoneMatrices"));
|
||||
if (buffer->Init(GPUBufferDescription::Typed((int32)(CurrentSize / sizeof(Float4)), PixelFormat::R32G32B32A32_Float, false, GPUResourceUsage::Dynamic)))
|
||||
{
|
||||
LOG(Error, "Failed to initialize the skinned mesh bones buffer");
|
||||
SAFE_DELETE_GPU_RESOURCE(buffer);
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
public:
|
||||
// [RenderList::IExtension]
|
||||
void Dispose() override
|
||||
{
|
||||
// Free memory
|
||||
Updates.Clear();
|
||||
FreeList.Clear();
|
||||
CurrentOffset = 0;
|
||||
CurrentSize = 0;
|
||||
ReallocateSize = 0;
|
||||
SAFE_DELETE_GPU_RESOURCE(GlobalBuffer);
|
||||
}
|
||||
void PreDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
|
||||
{
|
||||
// Free pending updates to collect the during drawing
|
||||
Updates.Clear();
|
||||
UpdateSize = 0;
|
||||
|
||||
// Setup global buffer on GPU
|
||||
if (!CurrentSize)
|
||||
return;
|
||||
if (!GlobalBuffer)
|
||||
{
|
||||
GlobalBuffer = InitBuffer();
|
||||
ReallocateSize = 0;
|
||||
}
|
||||
else if (ReallocateSize)
|
||||
{
|
||||
auto newGlobalBuffer = InitBuffer();
|
||||
context->CopyBuffer(newGlobalBuffer, GlobalBuffer, ReallocateSize);
|
||||
GlobalBuffer->DeleteObject(1.0f); // Delay destruction
|
||||
GlobalBuffer = newGlobalBuffer;
|
||||
ReallocateSize = 0;
|
||||
}
|
||||
}
|
||||
void PostDraw(GPUContext* context, RenderContextBatch& renderContextBatch) override
|
||||
{
|
||||
const int32 count = Items.Count();
|
||||
const int32 count = Updates.Count();
|
||||
if (count == 0)
|
||||
return;
|
||||
PROFILE_GPU_CPU_NAMED("Update Bones");
|
||||
GPUMemoryPass pass(context);
|
||||
Item* items = Items.Get();
|
||||
ScopeWriteLock lock(DataLocker);
|
||||
|
||||
// Special case for D3D11 backend that doesn't need transitions
|
||||
if (context->GetDevice()->GetRendererType() <= RendererType::DirectX11)
|
||||
auto* updates = Updates.Get();
|
||||
auto globalBuffer = GlobalBuffer;
|
||||
auto globalData = Data.Get();
|
||||
if (context->GetDevice()->GetRendererType() <= RendererType::DirectX11 || // Dynamic buffer cannot be updated partially on D3D11 (hence D3D11_MAP_WRITE_DISCARD), so update the whole buffer
|
||||
count >= 1000 || // When updates count is large, it is more efficient to update the whole buffer at once
|
||||
UpdateSize >= (uint32)(0.7f * CurrentOffset)) // When modified size is large, it is more efficient to update the whole buffer at once
|
||||
{
|
||||
for (int32 i = 0; i < count; i++)
|
||||
{
|
||||
Item& item = items[i];
|
||||
context->UpdateBuffer(item.BoneMatrices, item.Data, item.Size);
|
||||
}
|
||||
// Update whole buffer at once
|
||||
context->UpdateBuffer(globalBuffer, globalData, CurrentOffset);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Batch resource barriers for buffer update
|
||||
for (int32 i = 0; i < count; i++)
|
||||
pass.Transition(items[i].BoneMatrices, GPUResourceAccess::CopyWrite);
|
||||
|
||||
// Update all buffers within Memory Pass (no barriers between)
|
||||
// Update all modified chunks of the buffer
|
||||
for (int32 i = 0; i < count; i++)
|
||||
{
|
||||
Item& item = items[i];
|
||||
context->UpdateBuffer(item.BoneMatrices, item.Data, item.Size);
|
||||
auto& item = updates[i];
|
||||
context->UpdateBuffer(globalBuffer, globalData + item.Offset, item.Size, item.Offset);
|
||||
}
|
||||
|
||||
// Batch resource barriers for reading in Vertex Shader
|
||||
for (int32 i = 0; i < count; i++)
|
||||
pass.Transition(items[i].BoneMatrices, GPUResourceAccess::ShaderReadGraphics);
|
||||
pass.Transition(globalBuffer, GPUResourceAccess::ShaderReadGraphics);
|
||||
}
|
||||
|
||||
#if COMPILE_WITH_PROFILER
|
||||
// Insert amount of kilobytes of data updated into profiler trace
|
||||
uint32 dataSize = 0;
|
||||
for (int32 i = 0; i < count; i++)
|
||||
dataSize += items[i].Size;
|
||||
ZoneValue(dataSize / 1024);
|
||||
ZoneValue(UpdateSize / 1024); // Trace amount of kilobytes of data updated
|
||||
#endif
|
||||
|
||||
Items.Clear();
|
||||
Updates.Clear();
|
||||
UpdateSize = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -107,6 +201,94 @@ AnimatedModel::AnimatedModel(const SpawnParams& params)
|
||||
_sphere = BoundingSphere(Vector3::Zero, 0.0f);
|
||||
}
|
||||
|
||||
AnimatedModel::SkinnedBones::SkinnedBones()
|
||||
{
|
||||
static_assert(sizeof(*this) == sizeof(uint64), "Update size/alignment.");
|
||||
*(uint64*)this = 0;
|
||||
}
|
||||
|
||||
AnimatedModel::SkinnedBones::~SkinnedBones()
|
||||
{
|
||||
if (IsAllocated)
|
||||
{
|
||||
uint32 dataSize = BonesCount * sizeof(Matrix3x4) * (HasPrevBones ? 2 : 1);
|
||||
RenderListExtension.Free({ dataSize, GlobalBufferOffset });
|
||||
}
|
||||
}
|
||||
|
||||
void AnimatedModel::SkinnedBones::Update(const SkeletonData& skeleton, const Array<Matrix>& nodesPose, bool perBoneMotionBlur, bool reset)
|
||||
{
|
||||
const int32 bonesCount = skeleton.Bones.Count();
|
||||
|
||||
// Swap between two halves of the buffer for current/previous frame bones
|
||||
if (HasPrevBones)
|
||||
{
|
||||
IsPrevBones = !IsPrevBones;
|
||||
}
|
||||
|
||||
// Lazy-allocate from global buffer (double the size when using prev frame bones for per-bone motion vectors)
|
||||
if (!IsAllocated || BonesCount != bonesCount || HasPrevBones != perBoneMotionBlur)
|
||||
{
|
||||
if (IsAllocated)
|
||||
{
|
||||
uint32 dataSize = BonesCount * sizeof(Matrix3x4) * (HasPrevBones ? 2 : 1);
|
||||
RenderListExtension.Free({ dataSize, GlobalBufferOffset });
|
||||
}
|
||||
uint32 dataSize = bonesCount * sizeof(Matrix3x4) * (perBoneMotionBlur ? 2 : 1);
|
||||
auto alloc = RenderListExtension.Allocate(dataSize);
|
||||
GlobalBufferOffset = alloc.Offset;
|
||||
BonesCount = bonesCount;
|
||||
IsAllocated = true;
|
||||
IsPrevBones = false;
|
||||
IsPrevFlushed = false;
|
||||
HasPrevBones = perBoneMotionBlur;
|
||||
}
|
||||
else if (reset)
|
||||
{
|
||||
IsPrevBones = false;
|
||||
IsPrevFlushed = false;
|
||||
}
|
||||
|
||||
// Copy bones transformations to the CPU buffer (including bone offset matrix) and mark it as dirty to be flushed with GPU buffer later
|
||||
RenderListExtension.DataLocker.ReadLock();
|
||||
const SkeletonBone* bones = skeleton.Bones.Get();
|
||||
const Matrix* nodes = nodesPose.Get();
|
||||
Matrix3x4* output = (Matrix3x4*)(RenderListExtension.Data.Get() + GlobalBufferOffset); // DataLocker ensures it's safe to access (resizing happens within exclusive write-lock)
|
||||
if (IsPrevBones)
|
||||
output += BonesCount; // Write to the second half of the allocation
|
||||
ASSERT(nodesPose.Count() == skeleton.Nodes.Count());
|
||||
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
|
||||
{
|
||||
const SkeletonBone& bone = bones[boneIndex];
|
||||
Matrix matrix;
|
||||
Matrix::Multiply(bone.OffsetMatrix, nodes[bone.NodeIndex], matrix);
|
||||
output[boneIndex].SetMatrixTranspose(matrix);
|
||||
}
|
||||
RenderListExtension.DataLocker.ReadUnlock();
|
||||
IsDirty = true;
|
||||
}
|
||||
|
||||
void AnimatedModel::SkinnedBones::Flush()
|
||||
{
|
||||
uint32 size = BonesCount * sizeof(Matrix3x4);
|
||||
uint32 offset = GlobalBufferOffset;
|
||||
if (IsPrevBones)
|
||||
{
|
||||
// Write to the second half of the allocation (1st half will contain previous frame bones)
|
||||
offset += size;
|
||||
|
||||
// Mark initial flush of the previous frame bones
|
||||
IsPrevFlushed = true;
|
||||
}
|
||||
|
||||
// Add pending buffer update
|
||||
RenderListExtension.Updates.Add({ size, offset });
|
||||
Platform::InterlockedAdd(&RenderListExtension.UpdateSize, size);
|
||||
|
||||
// Clear dirty flag
|
||||
IsDirty = false;
|
||||
}
|
||||
|
||||
AnimatedModel::~AnimatedModel()
|
||||
{
|
||||
if (_deformation)
|
||||
@@ -139,15 +321,6 @@ void AnimatedModel::UpdateAnimation()
|
||||
|
||||
void AnimatedModel::SetupSkinningData()
|
||||
{
|
||||
ASSERT(SkinnedModel && SkinnedModel->IsLoaded());
|
||||
|
||||
const int32 targetBonesCount = SkinnedModel->Skeleton.Bones.Count();
|
||||
const int32 currentBonesCount = _skinningData.BonesCount;
|
||||
|
||||
if (targetBonesCount != currentBonesCount)
|
||||
{
|
||||
_skinningData.Setup(targetBonesCount);
|
||||
}
|
||||
}
|
||||
|
||||
void AnimatedModel::PreInitSkinningData()
|
||||
@@ -158,7 +331,6 @@ void AnimatedModel::PreInitSkinningData()
|
||||
PROFILE_MEM(Animations);
|
||||
ScopeLock lock(SkinnedModel->Locker);
|
||||
|
||||
SetupSkinningData();
|
||||
auto& skeleton = SkinnedModel->Skeleton;
|
||||
const int32 bonesCount = skeleton.Bones.Count();
|
||||
const int32 nodesCount = skeleton.Nodes.Count();
|
||||
@@ -180,15 +352,7 @@ void AnimatedModel::PreInitSkinningData()
|
||||
GraphInstance.RootTransform = nodesCount > 0 ? skeleton.Nodes[0].LocalTransform : Transform::Identity;
|
||||
|
||||
// Setup bones transformations including bone offset matrix
|
||||
Matrix3x4* output = (Matrix3x4*)_skinningData.Data.Get();
|
||||
const SkeletonBone* bones = skeleton.Bones.Get();
|
||||
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
|
||||
{
|
||||
auto& bone = bones[boneIndex];
|
||||
Matrix identityMatrix = bone.OffsetMatrix * nodesPose[bone.NodeIndex];
|
||||
output[boneIndex].SetMatrixTranspose(identityMatrix);
|
||||
}
|
||||
_skinningData.OnDataChanged(true);
|
||||
_bones.Update(skeleton, GraphInstance.NodesPose, PerBoneMotionBlur, true);
|
||||
|
||||
UpdateBounds();
|
||||
UpdateSockets();
|
||||
@@ -926,18 +1090,7 @@ void AnimatedModel::OnAnimationUpdated_Async()
|
||||
// Calculate the final bones transformations and update skinning
|
||||
{
|
||||
ANIM_GRAPH_PROFILE_EVENT("Final Pose");
|
||||
const int32 bonesCount = skeleton.Bones.Count();
|
||||
Matrix3x4* output = (Matrix3x4*)_skinningData.Data.Get();
|
||||
ASSERT(GraphInstance.NodesPose.Count() == skeleton.Nodes.Count());
|
||||
ASSERT(_skinningData.Data.Count() == bonesCount * sizeof(Matrix3x4));
|
||||
for (int32 boneIndex = 0; boneIndex < bonesCount; boneIndex++)
|
||||
{
|
||||
const SkeletonBone& bone = skeleton.Bones[boneIndex];
|
||||
Matrix matrix;
|
||||
Matrix::Multiply(bone.OffsetMatrix, GraphInstance.NodesPose.Get()[bone.NodeIndex], matrix);
|
||||
output[boneIndex].SetMatrixTranspose(matrix);
|
||||
}
|
||||
_skinningData.OnDataChanged(!PerBoneMotionBlur);
|
||||
_bones.Update(skeleton, GraphInstance.NodesPose, PerBoneMotionBlur);
|
||||
}
|
||||
|
||||
//if (UpdateWhenOffscreen)
|
||||
@@ -1077,18 +1230,26 @@ void AnimatedModel::Draw(RenderContext& renderContext)
|
||||
GEOMETRY_DRAW_STATE_EVENT_BEGIN(_drawState, world);
|
||||
|
||||
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(_transform.Translation, renderContext.View.WorldPosition));
|
||||
if (_skinningData.IsReady())
|
||||
if (_bones.IsAllocated)
|
||||
{
|
||||
// Flush skinning data with GPU
|
||||
if (_skinningData.IsDirty())
|
||||
{
|
||||
RenderListExtension.Items.Add({ _skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count() });
|
||||
_skinningData.OnFlush();
|
||||
}
|
||||
if (_bones.IsDirty)
|
||||
_bones.Flush();
|
||||
|
||||
SkinnedMesh::DrawInfo draw;
|
||||
draw.Buffer = &Entries;
|
||||
draw.Skinning = &_skinningData;
|
||||
draw.SkinningBones = RenderListExtension.GlobalBuffer;
|
||||
draw.SkinningBonesOffset = _bones.GlobalBufferOffset / sizeof(Matrix3x4);
|
||||
draw.WithPrevBones = _bones.HasPrevBones && _bones.IsPrevFlushed;
|
||||
if (draw.WithPrevBones)
|
||||
{
|
||||
draw.PrevBonesOffset = _bones.BonesCount;
|
||||
if (_bones.IsPrevBones)
|
||||
{
|
||||
draw.SkinningBonesOffset += draw.PrevBonesOffset;
|
||||
draw.PrevBonesOffset = -draw.PrevBonesOffset;
|
||||
}
|
||||
}
|
||||
draw.World = &world;
|
||||
draw.DrawState = &_drawState;
|
||||
draw.Deformation = _deformation;
|
||||
@@ -1120,18 +1281,26 @@ void AnimatedModel::Draw(RenderContextBatch& renderContextBatch)
|
||||
GEOMETRY_DRAW_STATE_EVENT_BEGIN(_drawState, world);
|
||||
|
||||
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(_transform.Translation, renderContext.View.WorldPosition));
|
||||
if (_skinningData.IsReady())
|
||||
if (_bones.IsAllocated)
|
||||
{
|
||||
// Flush skinning data with GPU
|
||||
if (_skinningData.IsDirty())
|
||||
{
|
||||
RenderListExtension.Items.Add({ _skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count() });
|
||||
_skinningData.OnFlush();
|
||||
}
|
||||
if (_bones.IsDirty)
|
||||
_bones.Flush();
|
||||
|
||||
SkinnedMesh::DrawInfo draw;
|
||||
draw.Buffer = &Entries;
|
||||
draw.Skinning = &_skinningData;
|
||||
draw.SkinningBones = RenderListExtension.GlobalBuffer;
|
||||
draw.SkinningBonesOffset = _bones.GlobalBufferOffset / sizeof(Matrix3x4);
|
||||
draw.WithPrevBones = _bones.HasPrevBones && _bones.IsPrevFlushed;
|
||||
if (draw.WithPrevBones)
|
||||
{
|
||||
draw.PrevBonesOffset = _bones.BonesCount;
|
||||
if (_bones.IsPrevBones)
|
||||
{
|
||||
draw.SkinningBonesOffset += draw.PrevBonesOffset;
|
||||
draw.PrevBonesOffset = -draw.PrevBonesOffset;
|
||||
}
|
||||
}
|
||||
draw.World = &world;
|
||||
draw.DrawState = &_drawState;
|
||||
draw.Deformation = _deformation;
|
||||
|
||||
Reference in New Issue
Block a user