UE4 Profiler 性能分析工具原理和实现机制

Viewed 11

一言以蔽之

UE4 的 Profiler 基于 Instrumentation(插桩)方案:通过 STAT 相关的宏(Macro)在代码中埋点插桩,并调用各平台的高分辨率时间戳接口来测量时间间隔。

相关工具

Session Frontend
Stat

所用方案

Instrumentation
详见《性能分析工具的实现原理探究》

自定义埋点

步骤

定义

// Declare a stat group: description TEXT("MarsTest"), group id STATGROUP_MarsTest,
// group category STATCAT_Advanced.
DECLARE_STATS_GROUP(TEXT("MarsTest"), STATGROUP_MarsTest, STATCAT_Advanced)
// Declare a cycle stat: counter name TEXT("MarsTest_Character_Tick"),
// stat id STAT_MarsTest_Character_Tick, belonging to group STATGROUP_MarsTest.
DECLARE_CYCLE_STAT(TEXT("MarsTest_Character_Tick"), STAT_MarsTest_Character_Tick, STATGROUP_MarsTest)

使用

// Place at the top of the scope to be measured: the macro declares a local
// FScopeCycleCounter for this stat, so timing spans that object's lifetime.
SCOPE_CYCLE_COUNTER(STAT_MarsTest_Character_Tick)

效果图

Stat MarsTest

Frontend - Profiler

核心原理

埋点

// Example invocation; its expansion is traced through the macro definitions that follow.
DECLARE_STATS_GROUP(TEXT("MarsTest"), STATGROUP_MarsTest, STATCAT_Advanced)

/**
 * Declares a stat group by forwarding to DECLARE_STAT_GROUP with fixed flags:
 * InDefaultEnable = true, InCompileTimeEnable = true, InSortByName = false.
 *
 * @param GroupDesc  Description of the group (forwarded as Description).
 * @param GroupId    Identifier of the group (forwarded as StatName).
 * @param GroupCat   Category of the group (forwarded as StatCategory).
 *
 * Note: the expansion already ends with a semicolon.
 */
# define DECLARE_STATS_GROUP(GroupDesc, GroupId, GroupCat) \
	DECLARE_STAT_GROUP(GroupDesc, GroupId, GroupCat, true, true, false);

/**
 * Generates a struct named FStatGroup_<StatName> that carries the group's
 * settings as compile-time constants and exposes them through static accessors.
 *
 * @param Description          Value returned unchanged by GetDescription() (a const TCHAR*).
 * @param StatName             Pasted into the struct name; its stringized form is returned by GetGroupName().
 * @param StatCategory         Its stringized form is returned by GetGroupCategory().
 * @param InDefaultEnable      Stored as enum constant DefaultEnable; returned by IsDefaultEnabled().
 * @param InCompileTimeEnable  Stored as enum constant CompileTimeEnable; returned by IsCompileTimeEnable().
 * @param InSortByName         Stored as enum constant SortByName; returned by GetSortByName().
 *
 * The three flags live in an anonymous enum, so they are usable as constant
 * expressions (e.g. TGroup::CompileTimeEnable as a template argument).
 */
# define DECLARE_STAT_GROUP(Description, StatName, StatCategory, InDefaultEnable, InCompileTimeEnable, InSortByName) \
struct FStatGroup_##StatName\
{ \
	enum \
	{ \
		DefaultEnable = InDefaultEnable, \
		CompileTimeEnable = InCompileTimeEnable, \
		SortByName = InSortByName \
	}; \
	static FORCEINLINE const char* GetGroupName() \
	{ \
		return #StatName; \
	} \
	static FORCEINLINE const char* GetGroupCategory() \
	{ \
		return #StatCategory; \
	} \
	static FORCEINLINE const TCHAR* GetDescription() \
	{ \
		return Description; \
	} \
	static FORCEINLINE bool IsDefaultEnabled() \
	{ \
		return (bool)DefaultEnable; \
	} \
	static FORCEINLINE bool IsCompileTimeEnable() \
	{ \
		return (bool)CompileTimeEnable; \
	} \
	static FORCEINLINE bool GetSortByName() \
	{ \
		return (bool)SortByName; \
	} \
};

// Example invocation; its expansion is traced through the macro and template definitions that follow.
DECLARE_CYCLE_STAT(TEXT("MarsTest_Character_Tick"), STAT_MarsTest_Character_Tick, STATGROUP_MarsTest)

/**
 * Declares a cycle stat and defines its static stat object in one step.
 *
 * Expands to DECLARE_STAT(...) with data type EStatDataType::ST_int64 and
 * memory region FPlatformMemory::MCR_Invalid, followed by `static DEFINE_STAT(StatId)`.
 * NOTE(review): DECLARE_STAT is not shown here; the two `true` arguments
 * presumably select "clear every frame" and "is a cycle stat" - confirm against its definition.
 *
 * @param CounterName  Display name of the counter.
 * @param StatId       Identifier of the stat; used to form FStat_<StatId> / StatPtr_<StatId>.
 * @param GroupId      Identifier of the stat group this stat belongs to.
 */
# define DECLARE_CYCLE_STAT(CounterName,StatId,GroupId) \
	DECLARE_STAT(CounterName,StatId,GroupId,EStatDataType::ST_int64, true, true, FPlatformMemory::MCR_Invalid); \
	static DEFINE_STAT(StatId)

/**
 * Defines the variable StatPtr_<Stat> of type FThreadSafeStaticStat<FStat_<Stat>>.
 * The expansion already ends with a semicolon.
 *
 * @param Stat  Identifier of the stat; pasted into both the type and the variable name.
 */
# define DEFINE_STAT(Stat) \
	struct FThreadSafeStaticStat<FStat_##Stat> StatPtr_##Stat;

/**
 * Per-stat object whose constructor eagerly resolves the stat id.
 *
 * Derives from FThreadSafeStaticStatInner, selected by the group's
 * CompileTimeEnable constant, and calls the base GetStatId() on construction.
 *
 * @tparam TStatData  Stat description type; must expose a nested TGroup with a CompileTimeEnable constant.
 */
template<class TStatData>
struct FThreadSafeStaticStat : public FThreadSafeStaticStatInner<TStatData, TStatData::TGroup::CompileTimeEnable>
{
	FThreadSafeStaticStat()
	{
		// This call registers the group if it is compile-time enabled.
		// It fixes a bug where a stat group only has counters that use the INC_/DEC_ macros:
		// those macros are guarded on stats collection being active, which prevented the
		// group from being registered, so it could not be activated unless another group
		// was already active.
		// Most groups are registered when a FScopeCycleCounter is declared, because
		// GetStatId is called to produce its constructor argument.
		// The call is qualified with the base class because the base is a dependent type.
		FThreadSafeStaticStatInner<TStatData, TStatData::TGroup::CompileTimeEnable>::GetStatId();
	}
};

/**
 * Lazily resolves and caches the stat id for TStatData.
 *
 * @tparam TStatData    Stat description type providing the static getters used below.
 * @tparam TCompiledIn  Not referenced in this primary template.
 *                      NOTE(review): presumably a specialization elsewhere handles the disabled case - confirm.
 */
template<class TStatData, bool TCompiledIn>
struct FThreadSafeStaticStatInner : public FThreadSafeStaticStatBase
{
	/**
	 * Returns the stat id, running DoSetup() the first time it is needed.
	 *
	 * @return TStatId wrapping the cached TStatIdData pointer.
	 */
	FORCEINLINE_STATS TStatId GetStatId() const
	{
		// Read the cached pointer with relaxed ordering; null means setup has not run yet.
		const TStatIdData* LocalHighPerformanceEnable = HighPerformanceEnable.Load(EMemoryOrder::Relaxed);
		if (!LocalHighPerformanceEnable)
		{
			// First use: DoSetup receives the stat's and its group's static description and returns the pointer.
			LocalHighPerformanceEnable = DoSetup(TStatData::GetStatName(), TStatData::GetDescription(), TStatData::TGroup::GetGroupName(), TStatData::TGroup::GetGroupCategory(), TStatData::TGroup::GetDescription(), TStatData::TGroup::IsDefaultEnabled(), TStatData::IsClearEveryFrame(), TStatData::GetStatType(), TStatData::IsCycleStat(), TStatData::TGroup::GetSortByName(), TStatData::GetMemoryRegion() );
		}
		return TStatId(LocalHighPerformanceEnable);
	}
	/** Returns the name of the stat, obtained from GetStatId(). */
	FORCEINLINE FName GetStatFName() const
	{
		return GetStatId().GetName();
	}
};

/**
 * One-time setup for a stat: publishes its metadata, obtains its TStatIdData
 * pointer from the group enable manager and caches it in HighPerformanceEnable.
 *
 * @param InStatName              Stat name; converted to an FName.
 * @param InStatDesc              Stat description.
 * @param InGroupName             Name of the owning group.
 * @param InGroupCategory         Category of the owning group.
 * @param InGroupDesc             Group description (passed to the metadata call only).
 * @param bDefaultEnable          Group default-enable flag (passed to the enable manager only).
 * @param bShouldClearEveryFrame  Forwarded to both calls.
 * @param InStatType              Data type of the stat.
 * @param bCycleStat              Forwarded to both calls.
 * @param bSortByName             Forwarded to both calls.
 * @param InMemoryRegion          Memory region associated with the stat.
 * @return The pointer obtained from the enable manager, which is also stored in HighPerformanceEnable.
 */
const TStatIdData* FThreadSafeStaticStatBase::DoSetup(const char* InStatName, const TCHAR* InStatDesc, const char* InGroupName, const char* InGroupCategory, const TCHAR* InGroupDesc, bool bDefaultEnable, bool bShouldClearEveryFrame, EStatDataType::Type InStatType, bool bCycleStat, bool bSortByName, FPlatformMemory::EMemoryCounterRegion InMemoryRegion) const
{
	FName TempName(InStatName);

	// send meta data, we don't use normal messages because the stats thread might not be running yet
	FStartupMessages::Get().AddMetadata(TempName, InStatDesc, InGroupName, InGroupCategory, InGroupDesc, bShouldClearEveryFrame, InStatType, bCycleStat, bSortByName, InMemoryRegion);

	// Ask the enable manager for this stat's data pointer, then publish it atomically.
	TStatIdData const* LocalHighPerformanceEnable(IStatGroupEnableManager::Get().GetHighPerformanceEnableForStat(FName(InStatName), InGroupName, InGroupCategory, bDefaultEnable, bShouldClearEveryFrame, InStatType, InStatDesc, bCycleStat, bSortByName, InMemoryRegion).GetRawPointer());
	TStatIdData const* OldHighPerformanceEnable = HighPerformanceEnable.Exchange(LocalHighPerformanceEnable);
	// The previous value must be null or the very same pointer; anything else would mean
	// this stat was assigned two different groups.
	check(!OldHighPerformanceEnable || LocalHighPerformanceEnable == OldHighPerformanceEnable); // we are assigned two different groups?

	return LocalHighPerformanceEnable;
}

// Example invocation; its expansion is traced through the macro and class definitions that follow.
SCOPE_CYCLE_COUNTER(STAT_MarsTest_Character_Tick)

/**
 * Declares a local FScopeCycleCounter named CycleCount_<Stat>, constructed
 * from GET_STATID(Stat). The object's constructor and destructor bracket the
 * rest of the enclosing scope. The expansion already ends with a semicolon.
 *
 * @param Stat  Identifier of the stat to time (GET_STATID is defined elsewhere).
 */
# define SCOPE_CYCLE_COUNTER(Stat) \
	FScopeCycleCounter CycleCount_##Stat(GET_STATID(Stat));

/**
 * Scope guard around FCycleCounter: the constructor calls Start() and the
 * destructor calls Stop(), so the measured interval is the object's lifetime.
 */
class FScopeCycleCounter : public FCycleCounter
{
public:
	/**
	 * Pushes the specified stat onto the hierarchy for this thread. Starts
	 * the timing of the cycles used
	 *
	 * @param StatId   Stat to time; forwarded to Start().
	 * @param bAlways  Forwarded to Start(); defaults to false.
	 */
	FORCEINLINE_STATS FScopeCycleCounter( TStatId StatId, bool bAlways = false )
	{
		Start( StatId, bAlways );
	}

	/**
	 * Updates the stat with the time spent
	 */
	FORCEINLINE_STATS ~FScopeCycleCounter()
	{
		Stop();
	}

};

/**
 * Ends whatever events are flagged in EmittedEvent, then clears the flags.
 * Each bit of EmittedEvent selects one backend to notify.
 */
FORCEINLINE_STATS void Stop()
	{
		// Platform named event was flagged: end it.
		if ( EmittedEvent & NamedEvent )
		{
			FPlatformMisc::EndNamedEvent();
		}

# if CPUPROFILERTRACE_ENABLED
		// CPU profiler trace event was flagged: emit its end marker.
		if (EmittedEvent & TraceEvent)
		{
			FCpuProfilerTrace::OutputEndEvent();
		}
# endif

		// Thread-stats event was flagged: post a CycleScopeEnd message for this stat.
		if(EmittedEvent & ThreadStatsEvent)
		{
			FThreadStats::AddMessage(StatId, EStatOperation::CycleScopeEnd);
		}

		// Reset the flags so a repeated Stop() emits nothing.
		EmittedEvent = 0;
	}

开销时间的获取

调用各平台的高分辨率时间戳来测量时间间隔

FStatMessage

GetValue_int64()= int64(FPlatformTime::Cycles());

FGenericPlatformTime

FGenericPlatformTime

/**
 * Returns the current gettimeofday() time in microseconds, truncated to 32 bits.
 *
 * Because only the low 32 bits are kept, the value wraps about every
 * 71.6 minutes (2^32 microseconds).
 */
static FORCEINLINE uint32 Cycles()
{
	struct timeval Now;
	gettimeofday( &Now, nullptr );

	const uint64 WholeSecondsInMicros = static_cast<uint64>(Now.tv_sec) * 1000000ULL;
	const uint64 RemainderMicros = static_cast<uint64>(Now.tv_usec);
	return static_cast<uint32>(WholeSecondsInMicros + RemainderMicros);
}

FWindowsPlatformTime

    /**
     * Returns the low 32 bits of the QueryPerformanceCounter() value.
     * The unit is performance-counter ticks, not microseconds.
     */
    static FORCEINLINE uint32 Cycles()
    {
        Windows::LARGE_INTEGER Counter;
        Windows::QueryPerformanceCounter(&Counter);
        return static_cast<uint32>(Counter.QuadPart);
    }

FSonyPlatformTime

    /**
     * Returns the process time counter, shifted right by CycleShift and
     * truncated to 32 bits. CycleShift is defined outside this excerpt.
     */
    static FORCEINLINE uint32 Cycles()
    {
        const uint64 ShiftedCounter = sceKernelGetProcessTimeCounter() >> CycleShift;
        return static_cast<uint32>(ShiftedCounter);
    }

Note: sceKernelGetProcessTimeCounter() returns the value of the monotonic 64-bit counter that is synchronized with the process time.

FUnixTime

/**
 * Returns the CLOCK_MONOTONIC time in microseconds, truncated to 32 bits.
 *
 * Because only the low 32 bits are kept, the value wraps about every
 * 71.6 minutes (2^32 microseconds).
 */
static FORCEINLINE uint32 Cycles()
{
	struct timespec Now;
	clock_gettime(CLOCK_MONOTONIC, &Now);

	const uint64 WholeSecondsInMicros = static_cast<uint64>(Now.tv_sec) * 1000000ULL;
	// Nanosecond remainder is rounded down to whole microseconds.
	const uint64 RemainderMicros = static_cast<uint64>(Now.tv_nsec) / 1000ULL;
	return static_cast<uint32>(WholeSecondsInMicros + RemainderMicros);
}

FApplePlatformTime
/**
 * Returns the low 32 bits of mach_absolute_time().
 * The unit is Mach absolute-time ticks, not microseconds.
 */
static FORCEINLINE uint32 Cycles()
{
	const uint64 AbsoluteTicks = mach_absolute_time();
	// Narrowing to 32 bits is intentional; the cast makes the truncation visible.
	return static_cast<uint32>(AbsoluteTicks);
}

FAndroidTime

/**
 * Returns the CLOCK_MONOTONIC time in microseconds, truncated to 32 bits.
 *
 * Because only the low 32 bits are kept, the value wraps about every
 * 71.6 minutes (2^32 microseconds).
 */
static FORCEINLINE uint32 Cycles()
{
	struct timespec MonotonicNow;
	clock_gettime(CLOCK_MONOTONIC, &MonotonicNow);

	uint64 Microseconds = static_cast<uint64>(MonotonicNow.tv_sec) * 1000000ULL;
	// Add the sub-second part, rounded down from nanoseconds to microseconds.
	Microseconds += static_cast<uint64>(MonotonicNow.tv_nsec) / 1000ULL;
	return static_cast<uint32>(Microseconds);
}

性能数据从获取到显示的全流程

活动图

UE4 Profile Mainflow Activity Diagram

类图

UE4 Profile Main Class Diagram

时序图

全流程

UE4 Profile Mainflow Full Sequence Diagram

UE4 Profile Mainflow Simplified Sequence Diagram

子流程拆解

Add FStatMessage

Handle Ticker

Set InDurationCycles by FStatMessage

Set StatData by InDurationCycles

Set InInclusiveTimeMS by StatData

Update EventGraphs

Display Profiler Event Graph

扩展阅读

UE4-程序性能优化与调试相关笔记
[UE4]Statコマンドに情報を追加しよう
StatsSystemOverview
在UE4C++中的宏
C++宏(Macro)的各种玩法
C/C++中宏/Macro的深入讲解
C/C++获取时间方法:gettimeofday()
QueryPerformanceCounter (QPC) - 获取高分辨率时间戳
QueryPerformanceFrequency function
QueryPerformanceCounter function
使用QueryPerformanceFrequency、QueryPerformanceCounter精确计时

**声明:**本文来自公众号:GameDevLearning,转载请附上原文链接及本声明。

0 Answers