Major refactor and extension of perfdata

All data is now collated in a separate thread via a lock-free queue. Data is collected for all "self" attachments, including non-rigged ones. Known issues: some double counting exists in the non-rigged alpha mask, and maybe elsewhere.
2021-10-21 13:18:45 +01:00 · 2021-10-21 13:18:45 +01:00 · a4a7a765f1
parent 390c136430
commit a4a7a765f1
26 changed files with 5270 additions and 422 deletions
--- a/indra/llcommon/blockingconcurrentqueue.h
+++ b/indra/llcommon/blockingconcurrentqueue.h
@ -0,0 +1,582 @@
+// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
+// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified
+// BSD license, available at the top of concurrentqueue.h.
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, see lightweightsemaphore.h).
+
+#pragma once
+
+#include "concurrentqueue.h"
+#include "lightweightsemaphore.h"
+
+#include <type_traits>
+#include <cerrno>
+#include <memory>
+#include <chrono>
+#include <ctime>
+
+namespace moodycamel
+{
+// This is a blocking version of the queue. It has an almost identical interface to
+// the normal non-blocking version, with the addition of various wait_dequeue() methods
+// and the removal of producer-specific dequeue methods.
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class BlockingConcurrentQueue
+{
+private:
+	typedef ::moodycamel::ConcurrentQueue<T, Traits> ConcurrentQueue;
+	typedef ::moodycamel::LightweightSemaphore LightweightSemaphore;
+
+public:
+	typedef typename ConcurrentQueue::producer_token_t producer_token_t;
+	typedef typename ConcurrentQueue::consumer_token_t consumer_token_t;
+	
+	typedef typename ConcurrentQueue::index_t index_t;
+	typedef typename ConcurrentQueue::size_t size_t;
+	typedef typename std::make_signed<size_t>::type ssize_t;
+	
+	static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE;
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD;
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE;
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE;
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE;
+	static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE;
+	
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
+		: inner(capacity), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>) // semaphore is built by create<>() and released through destroy<>()
+	{
+		assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); // layout check: `inner` must share the enclosing object's address
+		if (!sema) { // create<>() returns nullptr when Traits::malloc fails
+			MOODYCAMEL_THROW(std::bad_alloc());
+		}
+	}
+	
+	BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>) // all three sizing arguments are forwarded to the inner queue unchanged
+	{
+		assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); // same layout check as the single-argument constructor
+		if (!sema) { // create<>() returns nullptr when Traits::malloc fails
+			MOODYCAMEL_THROW(std::bad_alloc());
+		}
+	}
+	
+	// Disable copying and copy assignment
+	BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		: inner(std::move(other.inner)), sema(std::move(other.sema)) // both members are move-constructed from `other`; other.sema is left null
+	{ }
+	
+	inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		return swap_internal(other); // move-assignment is a swap: `other` ends up holding this queue's previous state
+	}
+	
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other); // the returned reference to *this is discarded
+	}
+	
+private:
+	BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other)
+	{
+		if (this == &other) { // self-swap: nothing to exchange
+			return *this;
+		}
+		
+		inner.swap(other.inner); // exchange the underlying queues
+		sema.swap(other.sema); // exchange the owned semaphores
+		return *this;
+	}
+	
+public:
+	// Enqueues a single item (by copying it).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		if ((details::likely)(inner.enqueue(item))) {
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		if ((details::likely)(inner.enqueue(std::move(item)))) { // item is handed to the inner queue as an rvalue
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		if ((details::likely)(inner.enqueue(token, item))) { // token is forwarded to the inner queue unchanged
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		if ((details::likely)(inner.enqueue(token, std::move(item)))) { // item is handed to the inner queue as an rvalue
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues several items.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		if ((details::likely)(inner.enqueue_bulk(std::forward<It>(itemFirst), count))) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); // a single signal call carrying the number of items enqueued
+			return true;
+		}
+		return false; // inner bulk enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		if ((details::likely)(inner.enqueue_bulk(token, std::forward<It>(itemFirst), count))) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); // a single signal call carrying the number of items enqueued
+			return true;
+		}
+		return false; // inner bulk enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory. Fails if not enough room to enqueue (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		if (inner.try_enqueue(item)) {
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner try_enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		if (inner.try_enqueue(std::move(item))) { // item is handed to the inner queue as an rvalue
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner try_enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		if (inner.try_enqueue(token, item)) { // token is forwarded to the inner queue unchanged
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner try_enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		if (inner.try_enqueue(token, std::move(item))) { // item is handed to the inner queue as an rvalue
+			sema->signal(); // one signal per item successfully enqueued
+			return true;
+		}
+		return false; // inner try_enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues several items.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (inner.try_enqueue_bulk(std::forward<It>(itemFirst), count)) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); // a single signal call carrying the number of items enqueued
+			return true;
+		}
+		return false; // inner bulk enqueue failed; the semaphore is not signalled
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		if (inner.try_enqueue_bulk(token, std::forward<It>(itemFirst), count)) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); // a single signal call carrying the number of items enqueued
+			return true;
+		}
+		return false; // inner bulk enqueue failed; the semaphore is not signalled
+	}
+	
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue(U& item)
+	{
+		if (sema->tryWait()) { // presumably a non-blocking acquire of one unit -- confirm in lightweightsemaphore.h
+			while (!inner.try_dequeue(item)) { // retry until the inner queue actually yields an item
+				continue;
+			}
+			return true;
+		}
+		return false; // tryWait() failed: the inner queue is not touched and `item` is not assigned
+	}
+	
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		if (sema->tryWait()) { // presumably a non-blocking acquire of one unit -- confirm in lightweightsemaphore.h
+			while (!inner.try_dequeue(token, item)) { // retry until the inner queue actually yields an item
+				continue;
+			}
+			return true;
+		}
+		return false; // tryWait() failed: the inner queue is not touched and `item` is not assigned
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); // `max` is overwritten with the value tryWaitMany returns
+		while (count != max) { // loops until exactly that many items have been taken; skipped entirely when it is 0
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); // `max` is overwritten with the value tryWaitMany returns
+		while (count != max) { // loops until exactly that many items have been taken; skipped entirely when it is 0
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+	
+	
+	
+	// Blocks the current thread until there's something to dequeue, then
+	// dequeues it.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline void wait_dequeue(U& item)
+	{
+		while (!sema->wait()) { // keep waiting until wait() reports success
+			continue;
+		}
+		while (!inner.try_dequeue(item)) { // then retry the inner dequeue until it yields an item
+			continue;
+		}
+	}
+
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout (specified in microseconds) expires. Returns false
+	// without setting `item` if the timeout expires, otherwise assigns
+	// to `item` and returns true.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs)
+	{
+		if (!sema->wait(timeout_usecs)) { // a false result from the timed wait is reported to the caller as a timeout
+			return false;
+		}
+		while (!inner.try_dequeue(item)) { // retry the inner dequeue until it yields an item
+			continue;
+		}
+		return true;
+	}
+    
+    // Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+	// Never allocates. Thread-safe.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count()); // converts the duration to a microsecond count and forwards to the microsecond overload
+    }
+	
+	// Blocks the current thread until there's something to dequeue, then
+	// dequeues it using an explicit consumer token.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline void wait_dequeue(consumer_token_t& token, U& item)
+	{
+		while (!sema->wait()) { // keep waiting until wait() reports success
+			continue;
+		}
+		while (!inner.try_dequeue(token, item)) { // then retry the inner dequeue until it yields an item
+			continue;
+		}
+	}
+	
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout (specified in microseconds) expires. Returns false
+	// without setting `item` if the timeout expires, otherwise assigns
+	// to `item` and returns true.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs)
+	{
+		if (!sema->wait(timeout_usecs)) { // a false result from the timed wait is reported to the caller as a timeout
+			return false;
+		}
+		while (!inner.try_dequeue(token, item)) { // retry the inner dequeue until it yields an item
+			continue;
+		}
+		return true;
+	}
+    
+    // Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+	// Never allocates. Thread-safe.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(token, item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count()); // converts the duration to a microsecond count and forwards to the microsecond overload
+    }
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which will
+	// always be at least one (this method blocks until the queue
+	// is non-empty) and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); // `max` is overwritten with the value waitMany returns
+		while (count != max) { // loops until exactly that many items have been taken from the inner queue
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue_bulk.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); // `max` is overwritten with the value the timed waitMany returns
+		while (count != max) { // skipped entirely when that value is 0, so 0 is returned
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+    
+    // Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It, typename Rep, typename Period>
+	inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count()); // converts the duration to a microsecond count and forwards to the microsecond overload, passing the iterator by reference
+    }
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which will
+	// always be at least one (this method blocks until the queue
+	// is non-empty) and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); // `max` is overwritten with the value waitMany returns
+		while (count != max) { // loops until exactly that many items have been taken from the inner queue
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue_bulk.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); // `max` is overwritten with the value the timed waitMany returns
+		while (count != max) { // skipped entirely when that value is 0, so 0 is returned
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count); // explicit It& instantiation -- presumably so itemFirst advances across retries; confirm in concurrentqueue.h
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It, typename Rep, typename Period>
+	inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(token, itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count()); // converts the duration to a microsecond count and forwards to the microsecond overload, passing the iterator by reference
+    }
+	
+	
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	inline size_t size_approx() const
+	{
+		return (size_t)sema->availableApprox(); // the estimate is read from the semaphore, not from `inner`
+	}
+	
+	
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe.
+	static bool is_lock_free()
+	{
+		return ConcurrentQueue::is_lock_free(); // delegates to the underlying ConcurrentQueue; the semaphore is not consulted
+	}
+	
+
+private:
+	template<typename U, typename A1, typename A2>
+	static inline U* create(A1&& a1, A2&& a2)
+	{
+		void* p = (Traits::malloc)(sizeof(U)); // raw storage obtained through the Traits allocator
+		return p != nullptr ? new (p) U(std::forward<A1>(a1), std::forward<A2>(a2)) : nullptr; // placement-new into that storage, or nullptr if the allocation failed
+	}
+	
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U(); // explicit destructor call; the storage itself is released below
+		}
+		(Traits::free)(p); // reached for a null p as well
+	}
+	
+private:
+	ConcurrentQueue inner;
+	std::unique_ptr<LightweightSemaphore, void (*)(LightweightSemaphore*)> sema;
+};
+
+
+template<typename T, typename Traits>
+inline void swap(BlockingConcurrentQueue<T, Traits>& a, BlockingConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b); // free-function form; forwards to the member swap
+}
+
+}	// end namespace moodycamel
--- a/indra/llcommon/concurrentqueue.h
+++ b/indra/llcommon/concurrentqueue.h
--- a/indra/llcommon/fsperfstats.cpp
+++ b/indra/llcommon/fsperfstats.cpp
@ -27,10 +27,10 @@
 #include "fsperfstats.h"
 namespace FSPerfStats
 {
-    int 	RecordSceneTime::writeBuffer{0};
-
-    bool 	RecordSceneTime::collectionEnabled{true};
-
-	std::array< typename RecordSceneTime::StatsArray, 2 > RecordSceneTime::stats{ {} };
+    std::atomic<int> 	StatsRecorder::writeBuffer{0};
+    bool 	            StatsRecorder::collectionEnabled{true};
+	std::array<StatsRecorder::StatsTypeMatrix,2>  StatsRecorder::statsDoubleBuffer{ {} };
+    std::array<StatsRecorder::StatsSummaryArray,2> StatsRecorder::max{ {} };
+    std::array<StatsRecorder::StatsSummaryArray,2> StatsRecorder::sum{ {} };

 }
--- a/indra/llcommon/fsperfstats.h
+++ b/indra/llcommon/fsperfstats.h
@ -28,19 +28,28 @@
 * $/LicenseInfo$
 */

+#include <atomic>
 #include <chrono>
 #include <array>
 #include <unordered_map>
+#include "lluuid.h"
+#include "lltimer.h"
+#include "blockingconcurrentqueue.h"
+#include "llapp.h"
+#include "fstelemetry.h"

+extern LLUUID gAgentID;
 namespace FSPerfStats
 {
-   	enum class ObjStatType_t{
-		RENDER_GEOMETRY=0,
-		RENDER_SHADOWS,
-		RENDER_COMBINED,
-		STATS_COUNT
+
+   	enum class ObjType_t{
+		OT_GENERAL=0, // Also Unknown. Used for n/a type stats such as scenery
+		OT_AVATAR,
+		OT_ATTACHMENT,
+		OT_HUD,
+        OT_COUNT // number of object types; sizes the per-type arrays in StatsRecorder
 	};
-   	enum class SceneStatType_t{
+   	enum class StatType_t{
 		RENDER_GEOMETRY=0,
 		RENDER_SHADOWS,
 		RENDER_HUDS,
@ -57,243 +66,251 @@ namespace FSPerfStats
 		STATS_COUNT
 	};
    
-    using ObjStatType = ObjStatType_t;
-    using SceneStatType = SceneStatType_t;
+    struct StatsRecord
+    { 
+        StatType_t  statType; // which statistic this sample is added to
+        ObjType_t   objType; // object category; selects the per-type map in StatsRecorder::processUpdate
+        LLUUID      objID; // key of the object within that map
+        uint64_t    time; // amount accumulated into the stat (units are not visible here -- confirm at the producers)
+    };

-	class RecordSceneTime
-	{
-        using StatsEnum = SceneStatType;
-        using StatsArray = std::array<uint64_t, static_cast<size_t>(StatsEnum::STATS_COUNT)>;
-        // using StatsBlock = std::unordered_map<T, StatsArray>;
-
-		static int writeBuffer;
-        static std::array<StatsArray,2> stats;
-        static bool collectionEnabled;
-
-        RecordSceneTime(const RecordSceneTime&) = delete;
-        RecordSceneTime() = delete;
-
-        const StatsEnum type;
-        std::chrono::steady_clock::time_point start;
+    class StatsRecorder{
+        using Queue = moodycamel::BlockingConcurrentQueue<StatsRecord>;
    public:
-
-        static inline void enable(){collectionEnabled=true;};
-        static inline void disable(){collectionEnabled=false;};
-        static inline bool enabled(){return(collectionEnabled);};
-
-        RecordSceneTime(SceneStatType type):start{std::chrono::steady_clock::now()}, type{type} {}
-
-        ~RecordSceneTime()
-        { 
-            auto val = std::chrono::duration<uint64_t, std::nano>(std::chrono::steady_clock::now() - start).count();
-            stats[writeBuffer][static_cast<size_t>(type)] += val;
-        };
-
-        static inline void toggleBuffer()
+        static inline StatsRecorder& getInstance()
         {
-            if(enabled())
-            {
-                // stats[writeBuffer][static_cast<size_t>(SceneStatType::RENDER_FPS)] = LLTrace::get_frame_recording().getPeriodMeanPerSec(LLStatViewer::FPS,3); // last 3 Frames
-                writeBuffer = (writeBuffer+1)%2;
-            }; // not we are relying on atomic updates here. The risk is low and would cause minor errors in the stats display. 
-
-            auto& statsArray = stats[writeBuffer];
-            std::fill_n(statsArray.begin() ,static_cast<size_t>(SceneStatType::STATS_COUNT),0);
+            static StatsRecorder instance; // function-local static: constructed on the first call, which also runs the private constructor
+            // volatile int dummy{};
+            return instance;
         }
-		static inline int getReadBufferIndex(){return (writeBuffer+1)%2;};
-        static inline StatsArray getCurrentStatsBuffer(){ return stats[getReadBufferIndex()];}
-        static inline uint64_t get(StatsEnum type){return stats[getReadBufferIndex()][static_cast<size_t>(type)];}
-	};
-	
-    template <typename T>
-    class RecordObjectTime
-	{
-        using StatsEnum = ObjStatType;
-        using StatsArray = std::array<uint64_t, static_cast<size_t>(StatsEnum::STATS_COUNT)>;
-        using StatsBlock = std::unordered_map<T, StatsArray>;

-		static int writeBuffer;
-        static std::array<StatsBlock,2> stats;
+        static inline void send(const StatsRecord& u){StatsRecorder::getInstance().q.enqueue(u);}; // copies the record into the queue; the bool result of enqueue is ignored
+        static inline void endFrame(){StatsRecorder::getInstance().q.enqueue(StatsRecord{});}; // a value-initialised record is the end-of-frame marker that processUpdate() tests for

-		static std::array<StatsArray,2> max;
-		static std::array<StatsArray,2> sum;
+        static inline void enable()     { collectionEnabled=true; }; // sets the static flag; plain bool, not atomic
+        static inline void disable()    { collectionEnabled=false; }; // clears the static flag
+        static inline bool enabled()    { return(collectionEnabled); }; // current value of the static flag
+
+		static inline int getReadBufferIndex() { return (writeBuffer ^ 1); }; // flips the low bit of writeBuffer: the other buffer of the pair when writeBuffer is 0 or 1
+        // static inline const StatsTypeMatrix& getCurrentStatsMatrix(){ return statsDoubleBuffer[getReadBufferIndex()];}
+        static inline uint64_t get(ObjType_t otype, LLUUID id, StatType_t type)
+        {
+            return statsDoubleBuffer[getReadBufferIndex()][static_cast<size_t>(otype)][id][static_cast<size_t>(type)]; // NOTE: unordered_map::operator[] inserts a zeroed entry when `id` is absent, so this read can modify the map
+        }
+        static inline uint64_t getSceneStat(StatType_t type)
+        {
+            static const LLUUID null_id{}; // default-constructed id used as the key for scene-wide stats
+            return statsDoubleBuffer[getReadBufferIndex()][static_cast<size_t>(ObjType_t::OT_GENERAL)][null_id][static_cast<size_t>(type)]; // NOTE: unordered_map::operator[] inserts a zeroed entry when the key is absent
+        }
+
+        static inline uint64_t getSum(ObjType_t otype, StatType_t type)
+        {
+            return sum[getReadBufferIndex()][static_cast<size_t>(otype)][static_cast<size_t>(type)]; // read-side total for this object type and stat, as accumulated by processUpdate()
+        }
+        static inline uint64_t getMax(ObjType_t otype, StatType_t type)
+        {
+            return max[getReadBufferIndex()][static_cast<size_t>(otype)][static_cast<size_t>(type)]; // read-side largest per-object value for this object type and stat, as tracked by processUpdate()
+        }
+    private:
+        StatsRecorder():q(100),t(&StatsRecorder::run) // q is built with capacity argument 100; t is started on StatsRecorder::run (declarations of q, t and run are not visible here)
+        {
+            // create a queue
+            // create a thread to consume from the queue
+            t.detach(); // the thread is detached and therefore never joined
+        }
+
+// StatsArray is a uint64_t for each possible statistic type.
+        using StatsArray    = std::array<uint64_t, static_cast<size_t>(FSPerfStats::StatType_t::STATS_COUNT)>;
+        using StatsMap      = std::unordered_map<LLUUID, StatsArray, FSUUIDHash>;
+        using StatsTypeMatrix = std::array<StatsMap, static_cast<size_t>(FSPerfStats::ObjType_t::OT_COUNT)>;
+        using StatsSummaryArray = std::array<StatsArray, static_cast<size_t>(FSPerfStats::ObjType_t::OT_COUNT)>;
+
+		static std::atomic<int> writeBuffer;
+        static std::array<StatsTypeMatrix,2> statsDoubleBuffer;
+        static std::array<StatsSummaryArray,2> max;
+        static std::array<StatsSummaryArray,2> sum;
        static bool collectionEnabled;

-        RecordObjectTime(const RecordObjectTime&) = delete;
-        RecordObjectTime() = delete;
-        const T key;
-        const StatsEnum type;
-        std::chrono::steady_clock::time_point start;

-    public:
-        static inline void enable(){collectionEnabled=true;};
-        static inline void disable(){collectionEnabled=false;};
-        static inline bool enabled(){return(collectionEnabled);};
-
-        RecordObjectTime(T key, ObjStatType type):start{std::chrono::steady_clock::now()}, key{key}, type{type} {}
-
-        ~RecordObjectTime()
-        { 
-            using ST = StatsEnum;
+        void processUpdate(const StatsRecord& upd)
+        {
+            FSZone;
+            using ST = StatType_t;
             // Note: nullptr is used as the key for global stats
             constexpr auto period{500};
-            auto val = std::chrono::duration<uint64_t, std::nano>(std::chrono::steady_clock::now() - start).count();
-            if(key)
+            if(upd.statType == StatType_t::RENDER_GEOMETRY && upd.objType == ObjType_t::OT_GENERAL && upd.objID == LLUUID{} && upd.time == 0) // matches the all-default record that endFrame() enqueues
             {
-                stats[writeBuffer][key][static_cast<size_t>(type)] += val;
-                stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)] += val;
-                if(max[writeBuffer][static_cast<size_t>(type)] < stats[writeBuffer][key][static_cast<size_t>(type)])
-                {
-                    max[writeBuffer][static_cast<size_t>(type)] = stats[writeBuffer][key][static_cast<size_t>(type)];
-                }
-                if(max[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] < stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)])
-                {
-                    max[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] = stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)];
-                }
-                sum[writeBuffer][static_cast<size_t>(type)] += val;
-                sum[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] += val;
+                toggleBuffer(); // end of frame: swap buffers instead of recording a sample
+                return;
             }
-        };
+
+            StatsMap& stm {statsDoubleBuffer[writeBuffer][static_cast<size_t>(upd.objType)]}; // per-object map for this object type in the write buffer
+            auto& key{upd.objID};
+            auto val {upd.time};
+            auto type {upd.statType};
+            FSZoneText(key.asString().c_str(), 36);
+            FSZoneValue(val);
+            auto& thisAsset = stm[key]; // operator[] inserts a zeroed StatsArray the first time this key is seen
+            thisAsset[static_cast<size_t>(type)] += val;
+            thisAsset[static_cast<size_t>(ST::RENDER_COMBINED)] += val; // every sample is also folded into the combined stat
+            FSZoneValue(thisAsset[static_cast<size_t>(type)]);
+            sum[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(type)] += val;
+            sum[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(ST::RENDER_COMBINED)] += val;
+            FSZoneValue(static_cast<size_t>(upd.objType));
+            FSZoneValue(statsDoubleBuffer[writeBuffer][static_cast<size_t>(upd.objType)][key][static_cast<size_t>(ST::RENDER_COMBINED)]);
+            if(max[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(type)] < stm[key][static_cast<size_t>(type)]) // stm[key] is the same element as thisAsset; max tracks the largest per-object accumulated value
+            {
+                max[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(type)] = stm[key][static_cast<size_t>(type)];
+            }
+            if(max[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(ST::RENDER_COMBINED)] < stm[key][static_cast<size_t>(ST::RENDER_COMBINED)]) // same tracking for the combined stat
+            {
+                max[writeBuffer][static_cast<size_t>(upd.objType)][static_cast<size_t>(ST::RENDER_COMBINED)] = stm[key][static_cast<size_t>(ST::RENDER_COMBINED)];
+            }
+        }
+
        static inline void toggleBuffer()
        {
-            using ST = StatsEnum;
+            FSPlot("q size", static_cast<int64_t>(StatsRecorder::getInstance().q.size_approx()));
+            FSZone;
+            using ST = StatType_t;

-            // auto& statsMap = stats[writeBuffer];
-            // for(auto& stat_entry : statsMap)
-            // {
-            //     auto val = stat_entry.second[static_cast<size_t>(ST::RENDER_COMBINED)];
-            //     auto avg = stats[(writeBuffer+1)%2][stat_entry.first][static_cast<size_t>(ST::RENDER_COMBINED)];
-            //     stat_entry.second[static_cast<size_t>(ST::RENDER_COMBINED)] = avg + (val/500) - (avg/500);
-            // }
-            if(enabled())
-            {
-                writeBuffer = (writeBuffer+1)%2;
-            }; // note we are relying on atomic updates here. The risk is low and would cause minor errors in the stats display. 
-            auto& statsMap = stats[writeBuffer];
+            auto& statsMap = statsDoubleBuffer[writeBuffer][static_cast<size_t>(ObjType_t::OT_ATTACHMENT)];
            for(auto& stat_entry : statsMap)
            {
-                std::fill_n(stat_entry.second.begin() ,static_cast<size_t>(ST::STATS_COUNT),0);
+                auto val = stat_entry.second[static_cast<size_t>(ST::RENDER_COMBINED)];
+                auto avg = statsDoubleBuffer[writeBuffer ^ 1][static_cast<size_t>(ObjType_t::OT_ATTACHMENT)][stat_entry.first][static_cast<size_t>(ST::RENDER_COMBINED)];
+                stat_entry.second[static_cast<size_t>(ST::RENDER_COMBINED)] = avg + (val/100) - (avg/100);
+            }
+            if(enabled())
+            {
+                writeBuffer ^= 1;
+            }; // note we are relying on atomic updates here. The risk is low and would cause minor errors in the stats display. 
+            auto& statsTypeMatrix = statsDoubleBuffer[writeBuffer];
+            for(auto& statsMap : statsTypeMatrix)
+            {
+                FSZoneN("Clear stats maps");
+                for(auto& stat_entry : statsMap)
+                {
+                    std::fill_n(stat_entry.second.begin() ,static_cast<size_t>(ST::STATS_COUNT),0);
+                }
+                statsMap.clear();
+            }
+            for(int i=0; i< static_cast<size_t>(ObjType_t::OT_COUNT); i++)
+            {
+                FSZoneN("clear max/sum");
+                max[writeBuffer][i].fill(0);
+                sum[writeBuffer][i].fill(0);
            }
-            statsMap.clear();
-            std::fill_n(max[writeBuffer].begin(),static_cast<size_t>(ST::STATS_COUNT),0);
-            std::fill_n(sum[writeBuffer].begin(),static_cast<size_t>(ST::STATS_COUNT),0);
        }
-		static inline int getReadbufferIndex(){return (writeBuffer+1)%2;};
-        static inline StatsBlock& getCurrentStatsBuffer(){ return stats[(writeBuffer+1)%2]; }
-        static inline uint64_t getMax(StatsEnum type){return max[(writeBuffer+1)%2][static_cast<size_t>(type)];}
-        static inline uint64_t getSum(StatsEnum type){return sum[(writeBuffer+1)%2][static_cast<size_t>(type)];}
-        static inline uint64_t getNum(){return stats[(writeBuffer+1)%2].size();}
-        static inline uint64_t get(T key, StatsEnum type){return stats[(writeBuffer+1)%2][key][static_cast<size_t>(type)];}
-	};
-    template <typename T>
-    class RecordAttachmentTime
+
+
+
+        /// Body of the "PerfStats" consumer thread: pulls records off the recorder's queue and
+        /// hands each one to processUpdate() until LLApp reports that the application is exiting.
+        static void run()
+        {
+            StatsRecord incoming;
+            auto& recorder = StatsRecorder::getInstance();
+            FSThreadName( "PerfStats" );
+
+            // The dequeue waits at most this long, so the exit flag is re-checked on every pass.
+            const auto pollInterval = std::chrono::milliseconds(5);
+            for(;;)
+            {
+                if( LLApp::isExiting() )
+                {
+                    break;
+                }
+                const bool gotRecord = recorder.q.wait_dequeue_timed(incoming, pollInterval);
+                if( !gotRecord )
+                {
+                    continue;
+                }
+                recorder.processUpdate(incoming);
+            }
+        }
+
+        // Queue that run() dequeues StatsRecord items from; toggleBuffer() plots its approximate size.
+        Queue q;
+        // Thread handle. Presumably the thread executing run(); the code that starts it is not
+        // visible in this hunk.
+        // NOTE(review): the destructor below is defaulted, and destroying a joinable std::thread
+        // calls std::terminate. Confirm the thread is detached or joined elsewhere.
+        std::thread t;
+
+        ~StatsRecorder() = default;
+        // Copying is disabled; the methods in this class reach the recorder through getInstance().
+        StatsRecorder(const StatsRecorder&) = delete;
+        StatsRecorder& operator=(const StatsRecorder&) = delete;
+
+    };
+
+    // std::chrono::duration<double> getTime(){
+
+    // auto begin= std::chrono::system_clock::now();
+    // for ( size_t i= 0; i <= tenMill; ++i){
+    //     StatsRecorder::getInstance();
+    // }
+    // return std::chrono::system_clock::now() - begin;
+    
+    // };
+
+	/// Scoped timer. Construction samples LLTimer::getCurrentClockCount(); destruction samples
+	/// it again and sends the difference to StatsRecorder as a StatsRecord.
+	///
+	/// @tparam ObjType object category stored in every record this timer produces.
+	template <enum ObjType_t ObjType>
+    class RecordTime
 	{
-        using StatsEnum = ObjStatType;
-        using StatsArray = std::array<uint64_t, static_cast<size_t>(StatsEnum::STATS_COUNT)>;
-        using StatsBlock = std::unordered_map<T, StatsArray>;

-		static int writeBuffer;
-        static std::array<StatsBlock,2> stats;
-
-		static std::array<StatsArray,2> max;
-		static std::array<StatsArray,2> sum;
-        static bool collectionEnabled;
-
-        RecordAttachmentTime(const RecordAttachmentTime&) = delete;
-        RecordAttachmentTime() = delete;
-        const T key;
-        const StatsEnum type;
-        std::chrono::steady_clock::time_point start;
+    private:
+        RecordTime(const RecordTime&) = delete;
+        RecordTime() = delete;
+        // Members are initialised in this declaration order, whatever order the constructor
+        // lists them in; `start` is last, so the clock is sampled after the other fields are set.
+        const StatType_t        type;
+        const decltype(ObjType) objType;
+        const LLUUID            objID;
+        U64 start;
+        // A timer without an object id would leave its const members uninitialised, so the
+        // single-argument form is deleted rather than given an empty body.
+        RecordTime( StatType_t type ) = delete;

     public:
+
+        // NOTE(review): collectionEnabled is not declared inside this class in the visible hunk;
+        // confirm it is still in scope for the three helpers below.
         static inline void enable(){collectionEnabled=true;};
         static inline void disable(){collectionEnabled=false;};
         static inline bool enabled(){return(collectionEnabled);};

-        RecordAttachmentTime(T key, ObjStatType type):start{std::chrono::steady_clock::now()}, key{key}, type{type} {}

-        ~RecordAttachmentTime()
+        /// Starts timing.
+        /// @param id   id stored in the record (callers pass a default LLUUID for scene-wide stats).
+        /// @param type statistic slot the elapsed time is recorded under.
+        RecordTime( const LLUUID& id, StatType_t type ):type{type}, objType{ObjType}, objID{id}, start{LLTimer::getCurrentClockCount()}{};
+
+        /// Stops timing and sends {type, objType, objID, elapsed clock counts} to StatsRecorder.
+        ~RecordTime()
         { 
-            using ST = StatsEnum;
-            // Note: nullptr is used as the key for global stats
-            auto val = std::chrono::duration<uint64_t, std::nano>(std::chrono::steady_clock::now() - start).count();
-            stats[writeBuffer][key][static_cast<size_t>(type)] += val;
-            stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)] += val;
-            if(max[writeBuffer][static_cast<size_t>(type)] < stats[writeBuffer][key][static_cast<size_t>(type)])
-            {
-                max[writeBuffer][static_cast<size_t>(type)] = stats[writeBuffer][key][static_cast<size_t>(type)];
-            }
-            if(max[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] < stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)])
-            {
-                max[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] = stats[writeBuffer][key][static_cast<size_t>(ST::RENDER_COMBINED)];
-            }
-            sum[writeBuffer][static_cast<size_t>(type)] += val;
-            sum[writeBuffer][static_cast<size_t>(ST::RENDER_COMBINED)] += val;
+            FSZoneC(tracy::Color::Red);
+            auto val = LLTimer::getCurrentClockCount() - start;
+            FSZoneValue(val);
+            FSZoneValue(static_cast<U64>(objType));
+            FSZoneText(objID.asString().c_str(), 36);
+            StatsRecord stat{type, objType, objID, val};
+            StatsRecorder::send(std::move(stat));
         };
-        static inline void toggleBuffer()
-        {
-            using ST = StatsEnum;
-            if(enabled())
-            {
-                writeBuffer = (writeBuffer+1)%2;
-            }; // note we are relying on atomic updates here. The risk is low and would cause minor errors in the stats display. 

-            auto& statsMap = stats[writeBuffer];
-            for(auto& stat_entry : statsMap)
-            {
-                std::fill_n(stat_entry.second.begin() ,static_cast<size_t>(ST::STATS_COUNT),0);
-            }
-            statsMap.clear();
-            std::fill_n(max[writeBuffer].begin(),static_cast<size_t>(ST::STATS_COUNT),0);
-            std::fill_n(sum[writeBuffer].begin(),static_cast<size_t>(ST::STATS_COUNT),0);
-        }
-		static inline int getReadbufferIndex(){return (writeBuffer+1)%2;};
-        static inline StatsBlock& getCurrentStatsBuffer(){ return stats[(writeBuffer+1)%2]; }
-        static inline uint64_t getMax(StatsEnum type){return max[(writeBuffer+1)%2][static_cast<size_t>(type)];}
-        static inline uint64_t getSum(StatsEnum type){return sum[(writeBuffer+1)%2][static_cast<size_t>(type)];}
-        static inline uint64_t getNum(){return stats[(writeBuffer+1)%2].size();}
-        static inline uint64_t get(T key, StatsEnum type){return stats[(writeBuffer+1)%2][key][static_cast<size_t>(type)];}
+
 	};
-    
-	static inline void toggleBuffer()
-    {
-        // RecordObjectTime<LLVOAvatar*>::toggleBuffer();
-        RecordSceneTime::toggleBuffer();
-    }

-    template< typename T >
-    int 	RecordObjectTime<T>::writeBuffer{0};
+    /// Converts a raw clock count to nanoseconds: scales by 1e9, then multiplies by
+    /// get_timer_info().mClockFrequencyInv.
+    inline double raw_to_ns(U64 raw)
+    {
+        const double scaledTicks = static_cast<double>(raw) * 1000000000.0;
+        return scaledTicks * get_timer_info().mClockFrequencyInv;
+    }
+    /// Converts a raw clock count to microseconds: scales by 1e6, then multiplies by
+    /// get_timer_info().mClockFrequencyInv.
+    inline double raw_to_us(U64 raw)
+    {
+        const double scaledTicks = static_cast<double>(raw) * 1000000.0;
+        return scaledTicks * get_timer_info().mClockFrequencyInv;
+    }
+    /// Converts a raw clock count to milliseconds: scales by 1e3, then multiplies by
+    /// get_timer_info().mClockFrequencyInv.
+    inline double raw_to_ms(U64 raw)
+    {
+        const double scaledTicks = static_cast<double>(raw) * 1000.0;
+        return scaledTicks * get_timer_info().mClockFrequencyInv;
+    }

-    template< typename T >
-    bool 	RecordObjectTime<T>::collectionEnabled{true};
-
-	template< typename T >
-    std::array< typename RecordObjectTime< T >::StatsArray, 2 >	RecordObjectTime<T>::max;
-
-	template< typename T >
-    std::array< typename RecordObjectTime< T >::StatsArray, 2 >	RecordObjectTime<T>::sum;
-	
-    template< typename T >
-	std::array< typename RecordObjectTime< T >::StatsBlock, 2 > RecordObjectTime< T >::stats{ {{}} };
-
-    template< typename T >
-    int 	RecordAttachmentTime<T>::writeBuffer{0};
-
-    template< typename T >
-    bool 	RecordAttachmentTime<T>::collectionEnabled{true};
-
-	template< typename T >
-    std::array< typename RecordAttachmentTime< T >::StatsArray, 2 >	RecordAttachmentTime<T>::max;
-
-	template< typename T >
-    std::array< typename RecordAttachmentTime< T >::StatsArray, 2 >	RecordAttachmentTime<T>::sum;
-	
-    template< typename T >
-	std::array< typename RecordAttachmentTime< T >::StatsBlock, 2 > RecordAttachmentTime< T >::stats{ {{}} };
+    using RecordSceneTime = RecordTime<ObjType_t::OT_GENERAL>;
+    using RecordAvatarTime = RecordTime<ObjType_t::OT_AVATAR>;
+    using RecordAttachmentTime = RecordTime<ObjType_t::OT_ATTACHMENT>;
+     
 }// namespace FSPerfStats

-
-
-
-
+// <FS:Beq> helper function
+using RATptr = std::unique_ptr<FSPerfStats::RecordAttachmentTime>;
+/// Starts a render timer for an attachment worn by the agent's own avatar.
+///
+/// Walks from vobj up to the attachment's root (the first object for which isRootEdit() is
+/// true) and, when that root's parent has the agent's id, returns a running
+/// RecordAttachmentTime keyed by the root's attachment item id. The timer stops when the
+/// returned pointer is destroyed.
+///
+/// @param vobj object being drawn; may be null.
+/// @return the timer, or nullptr when vobj is null, is not an attachment, has an unresolved
+///         parent somewhere on the way up, or is not attached to the agent.
+template <typename T>
+static inline RATptr trackMyAttachment(const T * vobj)
+{
+	if( !vobj ){return nullptr;};
+	const T* rootAtt{vobj};
+	if( rootAtt->isAttachment() )
+	{
+	    FSZone;
+		// getParent() is checked on every step: a missing parent must end the walk instead of
+		// being dereferenced.
+		while( rootAtt && !rootAtt->isRootEdit() )
+		{
+			rootAtt = static_cast<const T*>(rootAtt->getParent());
+		}
+		const T* wearer{ rootAtt ? static_cast<const T*>(rootAtt->getParent()) : nullptr };
+		if( wearer && wearer->getID() == gAgentID )
+		{
+            #if TRACY_ENABLE
+	        FSZoneNC( "trackMyAttachment:self", tracy::Color::Red );
+			auto& str = rootAtt->getAttachmentItemName();
+			FSZoneText(str.c_str(), str.size());
+			FSZoneText( rootAtt->getAttachmentItemID().asString().c_str(), 36);
+            #endif
+			return( std::make_unique<FSPerfStats::RecordAttachmentTime>( rootAtt->getAttachmentItemID(), FSPerfStats::StatType_t::RENDER_GEOMETRY) );
+		}
+	}
+	return nullptr;
+};
+// </FS:Beq>
 #endif
--- a/indra/llcommon/fstelemetry.h
+++ b/indra/llcommon/fstelemetry.h
@ -43,6 +43,8 @@
 #define FSZoneN( name ) ZoneNamedN( ___tracy_scoped_zone, name, FSTelemetry::active)
 #define FSZoneC( color ) ZoneNamedC( ___tracy_scoped_zone, color, FSTelemetry::active)
 #define FSZoneNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, FSTelemetry::active)
+#define FSZoneText( text, size ) ZoneText( text, size )
+#define FSZoneValue( num_uint64 ) ZoneValue( num_uint64 )
 #define FSPlot( name, value ) TracyPlot( name, value)
 #define FSFrameMark FrameMark
 #define FSThreadName( name ) tracy::SetThreadName( name )
@ -58,10 +60,12 @@
 #define FSZoneN( name ) 
 #define FSZoneC( color ) 
 #define FSZoneNC( name, color )
+#define FSZoneText( text, size )
+#define FSZoneValue( num_uint64 )
 #define FSPlot( name, value ) 
 #define FSFrameMark 
 #define FSThreadName( name ) 
-#define FSMessageL ( message )
+#define FSMessageL( message )
 #define FSTelemetryIsConnected
 #endif // TRACY_ENABLE

--- a/indra/llcommon/lightweightsemaphore.h
+++ b/indra/llcommon/lightweightsemaphore.h
@ -0,0 +1,411 @@
+// Provides an efficient implementation of a semaphore (LightweightSemaphore).
+// This is an extension of Jeff Preshing's semaphore implementation (licensed 
+// under the terms of its separate zlib license) that has been adapted and
+// extended by Cameron Desrochers.
+
+#pragma once
+
+#include <cstddef> // For std::size_t
+#include <atomic>
+#include <type_traits> // For std::make_signed<T>
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+
+// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
+// portable + lightweight semaphore implementations, originally from
+// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+// LICENSE:
+// Copyright (c) 2015 Jeff Preshing
+//
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+//	claim that you wrote the original software. If you use this software
+//	in a product, an acknowledgement in the product documentation would be
+//	appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//	misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+#if defined(_WIN32)
+// Semaphore (Windows): wraps a handle obtained from CreateSemaphoreW.
+// Copying is disabled through MOODYCAMEL_DELETE_FUNCTION.
+class Semaphore
+{
+private:
+	void* m_hSema;
+	
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	// Creates the semaphore with the given initial count and a maximum count of 0x7fffffff.
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		const long maxLong = 0x7fffffff;
+		m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		assert(m_hSema);
+	}
+
+	~Semaphore()
+	{
+		CloseHandle(m_hSema);
+	}
+
+	// Waits with a timeout of 0xffffffff; returns true when WaitForSingleObject returns 0.
+	bool wait()
+	{
+		const unsigned long infinite = 0xffffffff;
+		return WaitForSingleObject(m_hSema, infinite) == 0;
+	}
+	
+	// Zero-timeout wait; returns true when WaitForSingleObject returns 0.
+	bool try_wait()
+	{
+		return WaitForSingleObject(m_hSema, 0) == 0;
+	}
+	
+	// Waits up to usecs microseconds. The value is converted to whole milliseconds by integer
+	// division, so any sub-millisecond remainder is dropped (values below 1000 become 0).
+	bool timed_wait(std::uint64_t usecs)
+	{
+		return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
+	}
+
+	// Releases `count` units, retrying for as long as ReleaseSemaphore returns 0.
+	void signal(int count = 1)
+	{
+		while (!ReleaseSemaphore(m_hSema, count, nullptr));
+	}
+};
+#elif defined(__MACH__)
+//---------------------------------------------------------
+// Semaphore (Apple iOS and OSX)
+// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+//---------------------------------------------------------
+// Wraps a Mach semaphore_t created in the current task with SYNC_POLICY_FIFO.
+// Copying is disabled through MOODYCAMEL_DELETE_FUNCTION.
+class Semaphore
+{
+private:
+	semaphore_t m_sema;
+
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	// Creates the semaphore with the given initial count.
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		assert(rc == KERN_SUCCESS);
+		// rc is otherwise unused when assert() is compiled out.
+		(void)rc;
+	}
+
+	~Semaphore()
+	{
+		semaphore_destroy(mach_task_self(), m_sema);
+	}
+
+	// Returns true when semaphore_wait reports KERN_SUCCESS.
+	bool wait()
+	{
+		return semaphore_wait(m_sema) == KERN_SUCCESS;
+	}
+	
+	// Implemented as a timed wait with a zero timeout.
+	bool try_wait()
+	{
+		return timed_wait(0);
+	}
+	
+	// Waits up to timeout_usecs microseconds, passed to the kernel as whole seconds plus
+	// nanoseconds. Returns true only when semaphore_timedwait reports KERN_SUCCESS.
+	bool timed_wait(std::uint64_t timeout_usecs)
+	{
+		mach_timespec_t ts;
+		ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
+		ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);
+
+		// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+		kern_return_t rc = semaphore_timedwait(m_sema, ts);
+		return rc == KERN_SUCCESS;
+	}
+
+	// Releases one unit, retrying until semaphore_signal reports KERN_SUCCESS.
+	void signal()
+	{
+		while (semaphore_signal(m_sema) != KERN_SUCCESS);
+	}
+
+	// Releases `count` units one at a time; does nothing when count is zero or negative.
+	void signal(int count)
+	{
+		while (count-- > 0)
+		{
+			while (semaphore_signal(m_sema) != KERN_SUCCESS);
+		}
+	}
+};
+#elif defined(__unix__)
+//---------------------------------------------------------
+// Semaphore (POSIX, Linux)
+//---------------------------------------------------------
+// Wraps an unnamed POSIX sem_t (initialised with pshared = 0). Each wait flavour restarts its
+// call when it fails with EINTR. Copying is disabled through MOODYCAMEL_DELETE_FUNCTION.
+class Semaphore
+{
+private:
+	sem_t m_sema;
+
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	// Initialises the semaphore with the given count.
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		const int initStatus = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));
+		assert(initStatus == 0);
+		(void)initStatus;
+	}
+
+	~Semaphore()
+	{
+		sem_destroy(&m_sema);
+	}
+
+	// Blocks until a unit is taken; returns true when sem_wait finally returns 0.
+	bool wait()
+	{
+		// http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		for (;;)
+		{
+			const int status = sem_wait(&m_sema);
+			if (status != -1 || errno != EINTR)
+				return status == 0;
+		}
+	}
+
+	// Non-blocking attempt; returns true when sem_trywait returns 0.
+	bool try_wait()
+	{
+		for (;;)
+		{
+			const int status = sem_trywait(&m_sema);
+			if (status != -1 || errno != EINTR)
+				return status == 0;
+		}
+	}
+
+	// Waits up to usecs microseconds, measured against CLOCK_REALTIME.
+	bool timed_wait(std::uint64_t usecs)
+	{
+		const int usecs_per_sec = 1000000;
+		const int nsecs_per_sec = 1000000000;
+
+		// sem_timedwait takes an absolute deadline, so start from the current time.
+		struct timespec deadline;
+		clock_gettime(CLOCK_REALTIME, &deadline);
+		deadline.tv_sec += (time_t)(usecs / usecs_per_sec);
+		deadline.tv_nsec += (long)(usecs % usecs_per_sec) * 1000;
+		// sem_timedwait bombs if you have more than 1e9 in tv_nsec
+		// so carry the overflow into tv_sec before passing it in
+		if (deadline.tv_nsec >= nsecs_per_sec) {
+			++deadline.tv_sec;
+			deadline.tv_nsec -= nsecs_per_sec;
+		}
+
+		for (;;)
+		{
+			const int status = sem_timedwait(&m_sema, &deadline);
+			if (status != -1 || errno != EINTR)
+				return status == 0;
+		}
+	}
+
+	// Releases one unit, retrying while sem_post returns -1.
+	void signal()
+	{
+		while (sem_post(&m_sema) == -1) {}
+	}
+
+	// Releases `count` units one at a time; does nothing when count is zero or negative.
+	void signal(int count)
+	{
+		for (; count > 0; --count)
+		{
+			while (sem_post(&m_sema) == -1) {}
+		}
+	}
+};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+}	// end namespace details
+
+
+//---------------------------------------------------------
+// LightweightSemaphore
+//---------------------------------------------------------
+// Counting semaphore that keeps its count in an atomic and only falls back to the platform
+// details::Semaphore when a waiter has to block. A waiter first tries to take a unit with
+// atomic operations (optionally spinning up to maxSpins times) before blocking.
+class LightweightSemaphore
+{
+public:
+	typedef std::make_signed<std::size_t>::type ssize_t;
+
+private:
+	// Available units when positive. Blocking waiters decrement it below zero (see the
+	// fetch_sub calls below), and signal() uses the negated value to decide how many units
+	// to release on m_sema.
+	std::atomic<ssize_t> m_count;
+	// Platform semaphore that blocked waiters wait on.
+	details::Semaphore m_sema;
+	// Number of spin iterations attempted before blocking.
+	int m_maxSpins;
+
+	// Slow path for taking one unit.
+	// timeout_usecs < 0: block without a time limit; == 0: do not block; > 0: block for at
+	// most that many microseconds.
+	// Returns true when a unit was taken, false when the wait timed out.
+	bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
+	{
+		ssize_t oldCount;
+		int spin = m_maxSpins;
+		while (--spin >= 0)
+		{
+			oldCount = m_count.load(std::memory_order_relaxed);
+			if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
+				return true;
+			std::atomic_signal_fence(std::memory_order_acquire);	 // Prevent the compiler from collapsing the loop.
+		}
+		// Spinning did not succeed: decrement unconditionally. A positive previous value
+		// means a unit was available and has just been taken.
+		oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
+		if (oldCount > 0)
+			return true;
+		if (timeout_usecs < 0)
+		{
+			if (m_sema.wait())
+				return true;
+		}
+		if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs))
+			return true;
+		// At this point, we've timed out waiting for the semaphore, but the
+		// count is still decremented indicating we may still be waiting on
+		// it. So we have to re-adjust the count, but only if the semaphore
+		// wasn't signaled enough times for us too since then. If it was, we
+		// need to release the semaphore too.
+		while (true)
+		{
+			oldCount = m_count.load(std::memory_order_acquire);
+			if (oldCount >= 0 && m_sema.try_wait())
+				return true;
+			if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
+				return false;
+		}
+	}
+
+	// Slow path for taking between 1 and max units; timeout_usecs has the same meaning as in
+	// waitWithPartialSpinning. Returns the number of units taken, or 0 when the wait timed out.
+	ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1)
+	{
+		assert(max > 0);
+		ssize_t oldCount;
+		int spin = m_maxSpins;
+		while (--spin >= 0)
+		{
+			oldCount = m_count.load(std::memory_order_relaxed);
+			if (oldCount > 0)
+			{
+				// Take as many as are available, capped at max.
+				ssize_t newCount = oldCount > max ? oldCount - max : 0;
+				if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+					return oldCount - newCount;
+			}
+			std::atomic_signal_fence(std::memory_order_acquire);
+		}
+		oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
+		if (oldCount <= 0)
+		{
+			if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs)))
+			{
+				// The wait did not succeed: either undo the decrement and give up, or consume
+				// a release that arrived in the meantime (same scheme as in
+				// waitWithPartialSpinning).
+				while (true)
+				{
+					oldCount = m_count.load(std::memory_order_acquire);
+					if (oldCount >= 0 && m_sema.try_wait())
+						break;
+					if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
+						return 0;
+				}
+			}
+		}
+		// One unit is held at this point; opportunistically take up to max - 1 more.
+		if (max > 1)
+			return 1 + tryWaitMany(max - 1);
+		return 1;
+	}
+
+public:
+	// initialCount: units available at construction. maxSpins: spin iterations before a
+	// waiter blocks.
+	LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins)
+	{
+		assert(initialCount >= 0);
+		assert(maxSpins >= 0);
+	}
+
+	// Takes one unit if the count is positive; never blocks. Returns true on success.
+	bool tryWait()
+	{
+		ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+		while (oldCount > 0)
+		{
+			if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
+				return true;
+		}
+		return false;
+	}
+
+	// Takes one unit, blocking without a time limit if necessary.
+	bool wait()
+	{
+		return tryWait() || waitWithPartialSpinning();
+	}
+
+	// Takes one unit, waiting at most timeout_usecs microseconds (a negative value waits
+	// without a time limit). Returns false on timeout.
+	bool wait(std::int64_t timeout_usecs)
+	{
+		return tryWait() || waitWithPartialSpinning(timeout_usecs);
+	}
+
+	// Acquires between 0 and (greedily) max, inclusive
+	ssize_t tryWaitMany(ssize_t max)
+	{
+		assert(max >= 0);
+		ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+		while (oldCount > 0)
+		{
+			ssize_t newCount = oldCount > max ? oldCount - max : 0;
+			if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+				return oldCount - newCount;
+		}
+		return 0;
+	}
+
+	// Acquires at least one, and (greedily) at most max
+	// Returns 0 when max is 0 or when the wait timed out.
+	ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
+	{
+		assert(max >= 0);
+		ssize_t result = tryWaitMany(max);
+		if (result == 0 && max > 0)
+			result = waitManyWithPartialSpinning(max, timeout_usecs);
+		return result;
+	}
+	
+	// As above with no time limit (timeout of -1); asserts that at least one unit was taken.
+	ssize_t waitMany(ssize_t max)
+	{
+		ssize_t result = waitMany(max, -1);
+		assert(result > 0);
+		return result;
+	}
+
+	// Adds `count` units. When the previous count was negative, releases
+	// min(-previous, count) units on the platform semaphore.
+	void signal(ssize_t count = 1)
+	{
+		assert(count >= 0);
+		ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
+		ssize_t toRelease = -oldCount < count ? -oldCount : count;
+		if (toRelease > 0)
+		{
+			m_sema.signal((int)toRelease);
+		}
+	}
+	
+	// Relaxed snapshot of the count, clamped to zero when it is negative.
+	std::size_t availableApprox() const
+	{
+		ssize_t count = m_count.load(std::memory_order_relaxed);
+		return count > 0 ? static_cast<std::size_t>(count) : 0;
+	}
+};
+
+}   // end namespace moodycamel
--- a/indra/newview/llappviewer.cpp
+++ b/indra/newview/llappviewer.cpp
@ -1633,7 +1633,7 @@ bool LLAppViewer::frame()
 bool LLAppViewer::doFrame()
 {
 	{
-	FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_FRAME);
+	FSPerfStats::RecordSceneTime T (const LLUUID{}, FSPerfStats::StatType_t::RENDER_FRAME);

 	LLEventPump& mainloop(LLEventPumps::instance().obtain("mainloop"));
 	LLSD newFrame;
@ -1771,7 +1771,7 @@ bool LLAppViewer::doFrame()

 			// Update state based on messages, user input, object idle.
 			{
-				FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_IDLE);
+				FSPerfStats::RecordSceneTime T (const LLUUID{}, FSPerfStats::StatType_t::RENDER_IDLE);

 				pauseMainloopTimeout(); // *TODO: Remove. Messages shouldn't be stalling for 20+ seconds!

@ -1851,7 +1851,7 @@ bool LLAppViewer::doFrame()
 				// of equal priority on Windows
 				if (milliseconds_to_sleep > 0)
 				{
-					FSPerfStats::RecordSceneTime T ( FSPerfStats::SceneStatType::RENDER_SLEEP );
+					FSPerfStats::RecordSceneTime T ( LLUUID{}, FSPerfStats::StatType_t::RENDER_SLEEP );
 					ms_sleep(milliseconds_to_sleep);
 					// also pause worker threads during this wait period
 					LLAppViewer::getTextureCache()->pause();
@ -1929,7 +1929,7 @@ bool LLAppViewer::doFrame()
 			if (fsLimitFramerate && LLStartUp::getStartupState() == STATE_STARTED && !gTeleportDisplay && !logoutRequestSent() && max_fps > F_APPROXIMATELY_ZERO)
 			{
 				// Sleep a while to limit frame rate.
-				FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_FPSLIMIT);
+				FSPerfStats::RecordSceneTime T (const LLUUID{}, FSPerfStats::StatType_t::RENDER_FPSLIMIT);
 				F32 min_frame_time = 1.f / (F32)max_fps;
 				S32 milliseconds_to_sleep = llclamp((S32)((min_frame_time - frameTimer.getElapsedTimeF64()) * 1000.f), 0, 1000);
 				if (milliseconds_to_sleep > 0)
@ -1970,9 +1970,7 @@ bool LLAppViewer::doFrame()
    FSFrameMark; // <FS:Beq> Tracy support delineate Frame
    LLPROFILE_UPDATE();
 	}
-	FSPerfStats::RecordSceneTime::toggleBuffer();
-	FSPerfStats::RecordObjectTime<const LLVOAvatar*>::toggleBuffer();
-	FSPerfStats::RecordAttachmentTime<U32>::toggleBuffer();
+	FSPerfStats::StatsRecorder::endFrame();

 	return ! LLApp::isRunning();
 }
--- a/indra/newview/lldrawpool.cpp
+++ b/indra/newview/lldrawpool.cpp
@ -453,24 +453,15 @@ void LLRenderPass::applyModelMatrix(const LLDrawInfo& params)

 void LLRenderPass::pushBatch(LLDrawInfo& params, U32 mask, BOOL texture, BOOL batch_textures)
 {
+	FSZone;
 	// <FS:Beq> Capture render times
-	LLViewerObject* rootAtt{};
-	std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+	std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 	if(params.mFace)
 	{
-		LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
-		
+		LLViewerObject* vobj = params.mFace->getViewerObject();
 		if(vobj->isAttachment())
 		{
-			auto par = (LLViewerObject*)vobj->getParent();
-			rootAtt = vobj;
-			while( par->isAttachment() )
-			{
-				rootAtt = par;
-				par = (LLViewerObject*)par->getParent();
-			}
-			LL_INFOS() << "pushBatch recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-			if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+			T = trackMyAttachment( vobj );
 		}
 	}
 	// </FS:Beq>
--- a/indra/newview/lldrawpoolalpha.cpp
+++ b/indra/newview/lldrawpoolalpha.cpp
@ -342,6 +342,7 @@ void LLDrawPoolAlpha::render(S32 pass)

 void LLDrawPoolAlpha::renderAlphaHighlight(U32 mask)
 {
+	FSZone;
 	for (LLCullResult::sg_iterator i = gPipeline.beginAlphaGroups(); i != gPipeline.endAlphaGroups(); ++i)
 	{
 		LLSpatialGroup* group = *i;
@ -354,23 +355,13 @@ void LLDrawPoolAlpha::renderAlphaHighlight(U32 mask)
 			{
 				LLDrawInfo& params = **k;
 				// <FS:Beq> Capture render times
-				std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+				std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 				if(params.mFace)
 				{
-					LLViewerObject* rootAtt{};
 					LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
-					
 					if(vobj->isAttachment())
 					{
-						auto par = (LLViewerObject*)vobj->getParent();
-						rootAtt = vobj;
-						while( par->isAttachment() )
-						{
-							rootAtt = par;
-							par = (LLViewerObject*)par->getParent();
-						}
-						LL_INFOS() << "recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-						if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+						T = trackMyAttachment(vobj);					
 					}
 				}
 				// </FS:Beq>
@ -499,6 +490,7 @@ void LLDrawPoolAlpha::RestoreTexSetup(bool tex_setup)

 void LLDrawPoolAlpha::renderSimples(U32 mask, std::vector<LLDrawInfo*>& simples)
 {
+	FSZone;
    gPipeline.enableLightsDynamic();
    simple_shader->bind();
 	simple_shader->bindTexture(LLShaderMgr::BUMP_MAP, LLViewerFetchedTexture::sFlatNormalImagep);
@ -509,6 +501,15 @@ void LLDrawPoolAlpha::renderSimples(U32 mask, std::vector<LLDrawInfo*>& simples)
    bool use_shaders = gPipeline.canUseVertexShaders();
    for (LLDrawInfo* draw : simples)
    {
+		// <FS:Beq> Capture render times
+		FSZoneN("Simples");
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+		auto vobj = draw->mFace?draw->mFace->getViewerObject():nullptr;
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+		// </FS:Beq>
        bool tex_setup = TexSetup(draw, use_shaders, false, simple_shader);
        LLGLEnableFunc stencil_test(GL_STENCIL_TEST, draw->mSelected, &LLGLCommonFunc::selected_stencil_test);
 		gGL.blendFunc((LLRender::eBlendFactor) draw->mBlendFuncSrc, (LLRender::eBlendFactor) draw->mBlendFuncDst, mAlphaSFactor, mAlphaDFactor);
@ -527,6 +528,15 @@ void LLDrawPoolAlpha::renderFullbrights(U32 mask, std::vector<LLDrawInfo*>& full
    bool use_shaders = gPipeline.canUseVertexShaders();
    for (LLDrawInfo* draw : fullbrights)
    {
+		// <FS:Beq> Capture render times
+		FSZoneN("Fullbrights");
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+		auto vobj = draw->mFace?draw->mFace->getViewerObject():nullptr;
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+		// </FS:Beq>
        bool tex_setup = TexSetup(draw, use_shaders, false, fullbright_shader);

        LLGLEnableFunc stencil_test(GL_STENCIL_TEST, draw->mSelected, &LLGLCommonFunc::selected_stencil_test);
@ -547,6 +557,16 @@ void LLDrawPoolAlpha::renderMaterials(U32 mask, std::vector<LLDrawInfo*>& materi
    bool use_shaders = gPipeline.canUseVertexShaders();
    for (LLDrawInfo* draw : materials)
    {
+		// <FS:Beq> Capture render times
+		FSZoneN("Materials");
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+		auto vobj = draw->mFace?draw->mFace->getViewerObject():nullptr;
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+		// </FS:Beq>
+
        U32 mask = draw->mShaderMask;

 		llassert(mask < LLMaterial::SHADER_COUNT);
@ -629,6 +649,16 @@ void LLDrawPoolAlpha::renderEmissives(U32 mask, std::vector<LLDrawInfo*>& emissi
    bool use_shaders = gPipeline.canUseVertexShaders();
    for (LLDrawInfo* draw : emissives)
    {
+		// <FS:Beq> Capture render times
+		FSZoneN("Emissives");
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+		auto vobj = draw->mFace?draw->mFace->getViewerObject():nullptr;
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+		// </FS:Beq>
+
        bool tex_setup = TexSetup(draw, use_shaders, false, emissive_shader);
        drawEmissive(mask, draw);
        RestoreTexSetup(tex_setup);
@ -702,23 +732,14 @@ void LLDrawPoolAlpha::renderAlpha(U32 mask, S32 pass)
 				}

 				// <FS:Beq> Capture render times
-				std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+				std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 				if(params.mFace)
 				{
-					LLViewerObject* rootAtt{};
 					LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
 					
 					if(vobj->isAttachment())
 					{
-						auto par = (LLViewerObject*)vobj->getParent();
-						rootAtt = vobj;
-						while( par->isAttachment() )
-						{
-							rootAtt = par;
-							par = (LLViewerObject*)par->getParent();
-						}
-						LL_INFOS() << "ALPHA recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-						if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+						T = trackMyAttachment(vobj);
 					}
 				}
 				// </FS:Beq>
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@ -581,7 +581,7 @@ void LLDrawPoolAvatar::renderShadow(S32 pass)
 	{
 		return;
 	}
-	FSPerfStats::RecordObjectTime<const LLVOAvatar*> T(avatarp, FSPerfStats::ObjStatType::RENDER_SHADOWS);
+	FSPerfStats::RecordAvatarTime T(avatarp->getID(), FSPerfStats::StatType_t::RENDER_SHADOWS);

 	LLVOAvatar::AvatarOverallAppearance oa = avatarp->getOverallAppearance();
 	BOOL impostor = !LLPipeline::sImpostorRender && avatarp->isImpostor();
@ -1504,7 +1504,7 @@ void LLDrawPoolAvatar::renderAvatars(LLVOAvatar* single_avatar, S32 pass)
 	{
 		return;
 	}
-	FSPerfStats::RecordObjectTime<const LLVOAvatar*> T(avatarp, FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+	FSPerfStats::RecordAvatarTime T(avatarp->getID(), FSPerfStats::StatType_t::RENDER_GEOMETRY);

 	// <FS:Zi> Add avatar hitbox debug
 	static LLCachedControl<bool> render_hitbox(gSavedSettings, "DebugRenderHitboxes", false);
@ -2282,21 +2282,11 @@ void LLDrawPoolAvatar::renderRigged(LLVOAvatar* avatar, U32 type, bool glow)
 		}
 	
 		auto self = avatar->isSelf();
-		LLViewerObject * parentAttachment{nullptr};
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 		if(self && vobj->isAttachment())
 		{
-			LLViewerObject * vtop = vobj;
-			LLViewerObject * par  = (LLViewerObject *) vobj->getParent();
-
-			while (par && !(par->asAvatar()))
-			{
-				vtop = par;
-				par = (LLViewerObject *)vtop->getParent();
-			}
-			parentAttachment = vtop;
+			T = trackMyAttachment(vobj);
 		}
-		FSPerfStats::RecordAttachmentTime<U32> T(parentAttachment?parentAttachment->getAttachmentItemID().getCRC32():0, FSPerfStats::ObjStatType::RENDER_GEOMETRY);
-
 		
 		LLVolume* volume = vobj->getVolume();
 		S32 te = face->getTEOffset();
@ -2612,7 +2602,13 @@ void LLDrawPoolAvatar::updateRiggedVertexBuffers(LLVOAvatar* avatar)
 			{
 				continue;
 			}
-
+			// <FS:Beq> Capture render times
+			std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};			
+			if(vobj->isAttachment())
+			{
+				T = trackMyAttachment(vobj);
+			}
+			// </FS:Beq>
 			LLVolume* volume = vobj->getVolume();
 			S32 te = face->getTEOffset();

--- a/indra/newview/lldrawpoolbump.cpp
+++ b/indra/newview/lldrawpoolbump.cpp
@ -641,28 +641,20 @@ void LLDrawPoolBump::endFullbrightShiny()
 }

 void LLDrawPoolBump::renderGroup(LLSpatialGroup* group, U32 type, U32 mask, BOOL texture = TRUE)
-{					
+{			
+	FSZone;		
 	LLSpatialGroup::drawmap_elem_t& draw_info = group->mDrawMap[type];	
 	
 	for (LLSpatialGroup::drawmap_elem_t::iterator k = draw_info.begin(); k != draw_info.end(); ++k) 
 	{
 		LLDrawInfo& params = **k;
 		// <FS:Beq> Capture render times
-		LLViewerObject* rootAtt{};
-		std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 		LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
 		
 		if(vobj->isAttachment())
 		{
-			auto par = (LLViewerObject*)vobj->getParent();
-			rootAtt = vobj;
-			while( par->isAttachment() )
-			{
-				rootAtt = par;
-				par = (LLViewerObject*)par->getParent();
-			}
-			LL_INFOS() << "recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-			if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+			T= trackMyAttachment(vobj);
 		}
 		// </FS:Beq>		
 		applyModelMatrix(params);
@ -1530,24 +1522,16 @@ void LLDrawPoolBump::renderBump(U32 type, U32 mask)

 void LLDrawPoolBump::pushBatch(LLDrawInfo& params, U32 mask, BOOL texture, BOOL batch_textures)
 {
+	FSZone;
 	// <FS:Beq> Capture render times
-	std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+	std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 	if(params.mFace)
 	{
-		LLViewerObject* rootAtt{};
 		LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
 		
 		if(vobj->isAttachment())
 		{
-			auto par = (LLViewerObject*)vobj->getParent();
-			rootAtt = vobj;
-			while( par->isAttachment() )
-			{
-				rootAtt = par;
-				par = (LLViewerObject*)par->getParent();
-			}
-			// LL_INFOS() << "recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-			if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+			T = trackMyAttachment(vobj);
 		}
 	}
 	// </FS:Beq>
--- a/indra/newview/lldrawpoolmaterials.cpp
+++ b/indra/newview/lldrawpoolmaterials.cpp
@ -141,23 +141,14 @@ void LLDrawPoolMaterials::renderDeferred(S32 pass)
 		LLDrawInfo& params = **i;

 		// <FS:Beq> Capture render times
-		std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 		if(params.mFace)
 		{
-			LLViewerObject* rootAtt{};
 			LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
 			
 			if(vobj->isAttachment())
 			{
-				auto par = (LLViewerObject*)vobj->getParent();
-				rootAtt = vobj;
-				while( par->isAttachment() )
-				{
-					rootAtt = par;
-					par = (LLViewerObject*)par->getParent();
-				}
-				LL_INFOS() << "MATERIALS recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << " as " << rootAtt->getAttachmentItemID().getCRC32() << LL_ENDL;
-				if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+				T = trackMyAttachment(vobj);
 			}
 		}
 		// </FS:Beq>
@ -196,6 +187,18 @@ void LLDrawPoolMaterials::bindNormalMap(LLViewerTexture* tex)

 void LLDrawPoolMaterials::pushBatch(LLDrawInfo& params, U32 mask, BOOL texture, BOOL batch_textures)
 {
+	// <FS:Beq> Capture render times (T is only populated when the face belongs to an attachment)
+	std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+	if(params.mFace)
+	{
+		LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
+		// Guard against a face with no viewer object, as the alpha pool's per-draw tracking does.
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+	}
+	// </FS:Beq>
 	applyModelMatrix(params);
 	
 	bool tex_setup = false;
--- a/indra/newview/lldrawpoolsimple.cpp
+++ b/indra/newview/lldrawpoolsimple.cpp
@ -36,6 +36,7 @@
 #include "llspatialpartition.h"
 #include "llviewershadermgr.h"
 #include "llrender.h"
+#include "fsperfstats.h"

 static LLGLSLShader* simple_shader = NULL;
 static LLGLSLShader* fullbright_shader = NULL;
@ -152,6 +153,18 @@ void LLDrawPoolGlow::render(S32 pass)

 void LLDrawPoolGlow::pushBatch(LLDrawInfo& params, U32 mask, BOOL texture, BOOL batch_textures)
 {
+	// <FS:Beq> Capture render times (T is only populated when the face belongs to an attachment)
+	std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+	if(params.mFace)
+	{
+		LLViewerObject* vobj = (LLViewerObject *)params.mFace->getViewerObject();
+		// Guard against a face with no viewer object, as the alpha pool's per-draw tracking does.
+		if(vobj && vobj->isAttachment())
+		{
+			T = trackMyAttachment(vobj);
+		}
+	}
+	// </FS:Beq>
 	//gGL.diffuseColor4ubv(params.mGlowColor.mV);
 	LLRenderPass::pushBatch(params, mask, texture, batch_textures);
 }
--- a/indra/newview/lldynamictexture.cpp
+++ b/indra/newview/lldynamictexture.cpp
@ -228,12 +228,15 @@ BOOL LLViewerDynamicTexture::updateAllInstances()
 	BOOL ret = FALSE ;
 	for( S32 order = 0; order < ORDER_COUNT; order++ )
 	{
+		FSZone;
 		for (instance_list_t::iterator iter = LLViewerDynamicTexture::sInstances[order].begin();
 			 iter != LLViewerDynamicTexture::sInstances[order].end(); ++iter)
 		{
+			FSZone;
 			LLViewerDynamicTexture *dynamicTexture = *iter;
 			if (dynamicTexture->needsRender())
-			{				
+			{		
+				FSZoneN("needsRender");		
 				glClear(GL_DEPTH_BUFFER_BIT);
 				gDepthDirty = TRUE;
 								
@ -241,13 +244,19 @@ BOOL LLViewerDynamicTexture::updateAllInstances()
                dynamicTexture->setBoundTarget(use_fbo ? &gPipeline.mBake : nullptr);
 				dynamicTexture->preRender();	// Must be called outside of startRender()
 				result = FALSE;
+				{
+					FSZoneN("DynTexture->render");
 				if (dynamicTexture->render())
 				{
 					ret = TRUE ;
 					result = TRUE;
 					sNumRenders++;
 				}
+				}
+				{
+					FSZoneN("flush");
 				gGL.flush();
+				}
 				LLVertexBuffer::unbind();
 				dynamicTexture->setBoundTarget(nullptr);
 				dynamicTexture->postRender(result);
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@ -643,19 +643,10 @@ void renderFace(LLDrawable* drawable, LLFace *face)
    LLVOVolume* vobj = drawable->getVOVolume();
    if (vobj)
    {
-		LLVOVolume* rootAtt{};
-		std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>> T{};
+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 		if(vobj->isAttachment())
 		{
-			auto par = (LLVOVolume*)vobj->getParent();
-			rootAtt = vobj;
-			while( par->isAttachment() )
-			{
-				rootAtt = par;
-				par = (LLVOVolume*)par->getParent();
-			}
-			// LL_INFOS() << "recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << LL_ENDL;
-			if(rootAtt){T = std::unique_ptr<FSPerfStats::RecordAttachmentTime<U32>>(new FSPerfStats::RecordAttachmentTime<U32>(rootAtt->getAttachmentItemID().getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY));}
+			T = trackMyAttachment(vobj);
 		}
        LLVolume* volume = NULL;

@ -1255,6 +1246,11 @@ bool LLFace::canRenderAsMask()
 	{
 		return false;
 	}
+
+	// <FS:Beq> shortcircuit fully alpha faces
+	if(getViewerObject()->isHUDAttachment()){return false;};
+	if(te->getAlpha() == 0.0f && (te->getGlow() == 0.f)){FSZoneN("beqshortcircuit invisible");return true;}
+	// </FS:Beq>
 	
 	LLMaterial* mat = te->getMaterialParams();
 	if (mat && mat->getDiffuseAlphaMode() == LLMaterial::DIFFUSE_ALPHA_MODE_BLEND)
--- a/indra/newview/llfloaterperformance.cpp
+++ b/indra/newview/llfloaterperformance.cpp
@ -53,6 +53,10 @@ const S32 BAR_LEFT_PAD = 2;
 const S32 BAR_RIGHT_PAD = 5;
 const S32 BAR_BOTTOM_PAD = 9;

+constexpr auto AvType       {FSPerfStats::ObjType_t::OT_AVATAR};
+constexpr auto AttType      {FSPerfStats::ObjType_t::OT_ATTACHMENT};
+constexpr auto HudType      {FSPerfStats::ObjType_t::OT_HUD};
+constexpr auto SceneType    {FSPerfStats::ObjType_t::OT_GENERAL};
 class LLExceptionsContextMenu : public LLListContextMenu
 {
 public:
@ -164,44 +168,58 @@ void LLFloaterPerformance::showSelectedPanel(LLPanel* selected_panel)
 void LLFloaterPerformance::draw()
 {
    const S32 NUM_PERIODS = 50;
+    constexpr auto NANOS = 1000000000;
+    constexpr auto MICROS = 1000000;
+    constexpr auto MILLIS = 1000;
+    
+
    static LLCachedControl<U32> fps_cap(gSavedSettings, "FramePerSecondLimit"); // user limited FPS
    static LLCachedControl<U32> target_fps(gSavedSettings, "FSTargetFPS"); // desired FPS
    static LLCachedControl<bool> auto_tune(gSavedSettings, "FSAutoTuneFPS"); // auto tune enabled?
    static LLCachedControl<F32> max_render_cost(gSavedSettings, "RenderAvatarMaxART", 0);    

+    static auto freq_divisor = get_timer_info().mClockFrequencyInv;
    if (mUpdateTimer->hasExpired())
    {
       	LLStringUtil::format_map_t args;

        auto fps = LLTrace::get_frame_recording().getPeriodMedianPerSec(LLStatViewer::FPS, NUM_PERIODS);
        getChild<LLTextBox>("fps_value")->setValue((S32)llround(fps));
-        auto tot_frame_time_ns = 1000000000/fps;
-        auto target_frame_time_ns = 1000000000/(target_fps==0?1:target_fps);
-        auto tot_avatar_time = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::getSum(FSPerfStats::ObjStatType::RENDER_COMBINED);
-        auto tot_huds_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_HUDS) ;
-        auto tot_sleep_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_SLEEP);
-        auto tot_ui_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_UI);
-        auto tot_idle_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_IDLE);
-        auto tot_limit_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_FPSLIMIT);
-        auto tot_swap_time = FSPerfStats::RecordSceneTime::get(FSPerfStats::SceneStatType::RENDER_SWAP);
+        auto tot_frame_time_ns = NANOS/fps;
+        auto target_frame_time_ns = NANOS/(target_fps==0?1:target_fps);
+        auto tot_avatar_time_raw = FSPerfStats::StatsRecorder::getSum(AvType, FSPerfStats::StatType_t::RENDER_COMBINED);
+        auto tot_huds_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_HUDS);
+        auto tot_sleep_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_SLEEP);
+        auto tot_ui_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_UI);
+        auto tot_idle_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_IDLE);
+        auto tot_limit_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_FPSLIMIT);
+        auto tot_swap_time_raw = FSPerfStats::StatsRecorder::getSceneStat(FSPerfStats::StatType_t::RENDER_SWAP);

-        // once the rest is extracted what is left is the scene cost (we don't include non-render activities such as network here prlloy should.)
-        auto tot_scene_time = tot_frame_time_ns - tot_avatar_time - tot_huds_time - tot_ui_time - tot_sleep_time - tot_limit_time - tot_swap_time;
+        auto tot_avatar_time_ns = FSPerfStats::raw_to_ns( tot_avatar_time_raw );
+        auto tot_huds_time_ns = FSPerfStats::raw_to_ns( tot_huds_time_raw );
+        auto tot_sleep_time_ns = FSPerfStats::raw_to_ns( tot_sleep_time_raw );
+        auto tot_ui_time_ns = FSPerfStats::raw_to_ns( tot_ui_time_raw );
+        auto tot_idle_time_ns = FSPerfStats::raw_to_ns( tot_idle_time_raw );
+        auto tot_limit_time_ns = FSPerfStats::raw_to_ns( tot_limit_time_raw );
+        auto tot_swap_time_ns = FSPerfStats::raw_to_ns( tot_swap_time_raw );
+
+        // once the rest is extracted what is left is the scene cost
+        auto tot_scene_time_ns = tot_frame_time_ns - tot_avatar_time_ns - tot_huds_time_ns - tot_ui_time_ns - tot_sleep_time_ns - tot_limit_time_ns - tot_swap_time_ns - tot_idle_time_ns;
        // remove time spent sleeping for fps limit or out of focus.
-        tot_frame_time_ns -= tot_limit_time;
-        tot_frame_time_ns -= tot_sleep_time;
+        tot_frame_time_ns -= tot_limit_time_ns;
+        tot_frame_time_ns -= tot_sleep_time_ns;

        if(tot_frame_time_ns == 0)
        {
            LL_WARNS("performance") << "things went wrong, quit while we can." << LL_ENDL;
            return;
        }
-        auto pct_avatar_time = (tot_avatar_time*100)/tot_frame_time_ns;
-        auto pct_huds_time = (tot_huds_time*100)/tot_frame_time_ns;
-        auto pct_ui_time = (tot_ui_time*100)/tot_frame_time_ns;
-        auto pct_idle_time = (tot_idle_time*100)/tot_frame_time_ns;
-        auto pct_swap_time = (tot_swap_time*100)/tot_frame_time_ns;
-        auto pct_scene_time = (tot_scene_time*100)/tot_frame_time_ns;
+        auto pct_avatar_time = (tot_avatar_time_ns * 100)/tot_frame_time_ns;
+        auto pct_huds_time = (tot_huds_time_ns * 100)/tot_frame_time_ns;
+        auto pct_ui_time = (tot_ui_time_ns * 100)/tot_frame_time_ns;
+        auto pct_idle_time = (tot_idle_time_ns * 100)/tot_frame_time_ns;
+        auto pct_swap_time = (tot_swap_time_ns * 100)/tot_frame_time_ns;
+        auto pct_scene_time = (tot_scene_time_ns * 100)/tot_frame_time_ns;

        args["AV_FRAME_PCT"] = llformat("%02u", (U32)llround(pct_avatar_time));
        args["HUDS_FRAME_PCT"] = llformat("%02u", (U32)llround(pct_huds_time));
@ -217,13 +235,13 @@ void LLFloaterPerformance::draw()
        getChild<LLTextBox>("frame_breakdown")->setText(getString("frame_stats", args));
        
        auto textbox = getChild<LLTextBox>("fps_warning");
-        if(tot_sleep_time > 0) // We are sleeping because view is not focussed
+        if(tot_sleep_time_raw > 0) // We are sleeping because view is not focussed
        {
            textbox->setVisible(true);
            textbox->setText(getString("focus_fps"));
            textbox->setColor(LLUIColorTable::instance().getColor("DrYellow"));
        }
-        else if (tot_limit_time > 0)
+        else if (tot_limit_time_raw > 0)
        {
            textbox->setVisible(true);
            textbox->setText(getString("limit_fps", args));
@ -242,11 +260,11 @@ void LLFloaterPerformance::draw()

        if( auto_tune )
        {
-            auto av_render_max = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::getMax(FSPerfStats::ObjStatType::RENDER_COMBINED);
+            auto av_render_max_raw = FSPerfStats::StatsRecorder::getMax(AvType, FSPerfStats::StatType_t::RENDER_COMBINED);

            // if( target_frame_time_ns <= tot_frame_time_ns )
            // {
-            //     U32 non_avatar_time_ns = tot_frame_time_ns - tot_avatar_time;
+            //     U32 non_avatar_time_ns = tot_frame_time_ns - tot_avatar_time_raw;
            //     if( non_avatar_time_ns < target_frame_time_ns )
            //     {
            //         F32 target_avatar_time_ms {F32(target_frame_time_ns-non_avatar_time_ns)/1000000};
@ -255,33 +273,41 @@ void LLFloaterPerformance::draw()
            //     }
            // }

+            // Is our target frame time lower than current? If so we need to take action to reduce draw overheads.
            if( target_frame_time_ns <= tot_frame_time_ns )
            {
                LL_INFOS() << "AUTO_TUNE: adapting frame rate" << LL_ENDL;
-                U32 non_avatar_time_ns = tot_frame_time_ns - tot_avatar_time;
-                LL_INFOS() << "AUTO_TUNE: adapting frame rate: target_frame=" << target_frame_time_ns << " nonav_frame_time=" << non_avatar_time_ns << " headroom=" << target_frame_time_ns - non_avatar_time_ns << LL_ENDL;
+                U32 non_avatar_time_ns = tot_frame_time_ns - tot_avatar_time_ns;
+                LL_INFOS() << "AUTO_TUNE: adapting frame rate: target_frame=" << target_frame_time_ns << " nonav_frame_time=" << non_avatar_time_ns << " headroom=" << (S64)target_frame_time_ns - non_avatar_time_ns << LL_ENDL;
+                // If the non-avatar frame time < target frame time then we can potentially reach it.
                if( non_avatar_time_ns < target_frame_time_ns )
                {
                    U64 target_avatar_time_ns {target_frame_time_ns-non_avatar_time_ns};
                    LL_INFOS() << "AUTO_TUNE: avatar_budget:" << target_avatar_time_ns << LL_ENDL;
-                    if(target_avatar_time_ns < tot_avatar_time)
+                    if(target_avatar_time_ns < tot_avatar_time_ns)
                    {
-                        F32 new_render_limit = (F32)(av_render_max-100000)/1000000;
-                        if(new_render_limit >= max_render_cost)
+                        F32 new_render_limit_ms = (F32)(FSPerfStats::raw_to_ms(av_render_max_raw)-0.1);
+                        if(new_render_limit_ms >= max_render_cost)
                        {
                            // we caught a bad frame possibly with a forced refresh render.
-                            new_render_limit = max_render_cost - 0.1;  
+                            new_render_limit_ms = max_render_cost - 0.1;  
                        }
-                        gSavedSettings.setF32( "RenderAvatarMaxART",  new_render_limit);
-                        LL_INFOS() << "AUTO_TUNE: avatar_budget adjusted to:" << new_render_limit << LL_ENDL;
+                        gSavedSettings.setF32( "RenderAvatarMaxART",  new_render_limit_ms);
+                        LL_INFOS() << "AUTO_TUNE: avatar_budget adjusted to:" << new_render_limit_ms << LL_ENDL;
                    }
-                    LL_INFOS() << "AUTO_TUNE: Target frame time:"<<target_frame_time_ns/1000000 << " (non_avatar is " << non_avatar_time_ns/1000000 << ") Max cost limited=" << max_render_cost << LL_ENDL;
+                    LL_INFOS() << "AUTO_TUNE: Target frame time:"<<target_frame_time_ns/1000000 << "ms (non_avatar is " << non_avatar_time_ns/1000000 << "ms) Max cost limited=" << max_render_cost << LL_ENDL;
+                }
+                else
+                {
+                    // TODO(Beq): Set advisory text for further actions
+                    LL_INFOS() << "AUTO_TUNE: Unachievable target . Target frame time:"<<target_frame_time_ns/1000000 << "ms (non_avatar is " << non_avatar_time_ns/1000000 << "ms)" << LL_ENDL;
+                    textbox->setColor(LLUIColorTable::instance().getColor("red"));
                }
            }
            else 
                if( target_frame_time_ns > (tot_frame_time_ns + max_render_cost))
            {
-                // if we have more space to spare let's shift up little in the hope we'll restore an avatar.
+                // if we have more time to spare let's shift up a little in the hope we'll restore an avatar.
                gSavedSettings.setF32( "RenderAvatarMaxART",  max_render_cost + 0.5 );
            }
        }
@ -339,18 +365,20 @@ void LLFloaterPerformance::populateHUDList()
    hud_complexity_list_t::iterator iter = complexity_list.begin();
    hud_complexity_list_t::iterator end = complexity_list.end();

+    static auto freq_divisor = get_timer_info().mClockFrequencyInv;
+
    U32 max_complexity = 0;
    for (; iter != end; ++iter)
    {
        max_complexity = llmax(max_complexity, (*iter).objectsCost);
    }
   
-    auto huds_max_render_time = FSPerfStats::RecordObjectTime<LLHUDObject*>::getMax(FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+    auto huds_max_render_time_raw = FSPerfStats::StatsRecorder::getMax(HudType, FSPerfStats::StatType_t::RENDER_GEOMETRY);
    for (iter = complexity_list.begin(); iter != end; ++iter)
    {
        LLHUDComplexity hud_object_complexity = *iter;     
        auto hud_ptr = hud_object_complexity.objectPtr;
-        auto hud_render_time = FSPerfStats::RecordObjectTime<const LLViewerObject*>::get(hud_ptr, FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+        auto hud_render_time_raw = FSPerfStats::StatsRecorder::get(HudType, hud_ptr->getID(), FSPerfStats::StatType_t::RENDER_GEOMETRY);
        LLSD item;
        item["special_id"] = hud_object_complexity.objectId;
        item["target"] = LLNameListCtrl::SPECIAL;
@ -358,15 +386,15 @@ void LLFloaterPerformance::populateHUDList()
        row[0]["column"] = "complex_visual";
        row[0]["type"] = "bar";
        LLSD& value = row[0]["value"];
-        value["ratio"] = (F32)hud_render_time / huds_max_render_time;
+        value["ratio"] = (F32)hud_render_time_raw / huds_max_render_time_raw;
        value["bottom"] = BAR_BOTTOM_PAD;
        value["left_pad"] = BAR_LEFT_PAD;
        value["right_pad"] = BAR_RIGHT_PAD;

        row[1]["column"] = "complex_value";
        row[1]["type"] = "text";
-        LL_INFOS() << "HUD : hud[" << hud_ptr << " time:" << hud_render_time <<" total_time:" << huds_max_render_time << LL_ENDL;
-        row[1]["value"] = llformat("%.2f",((double)hud_render_time / 1000000000));
+        LL_INFOS() << "HUD : hud[" << hud_ptr << " time:" << hud_render_time_raw <<" total_time:" << huds_max_render_time_raw << LL_ENDL;
+        row[1]["value"] = llformat( "%.3f",FSPerfStats::raw_to_us(hud_render_time_raw) );
        row[1]["font"]["name"] = "SANSSERIF";
 
        row[2]["column"] = "name";
@ -401,42 +429,49 @@ void LLFloaterPerformance::populateObjectList()
    object_complexity_list_t::iterator iter = complexity_list.begin();
    object_complexity_list_t::iterator end = complexity_list.end();

+    static auto freq_divisor = get_timer_info().mClockFrequencyInv;
+
    U32 max_complexity = 0;
    for (; iter != end; ++iter)
    {
        max_complexity = llmax(max_complexity, (*iter).objectCost);
    }

-    auto max_render_time = FSPerfStats::RecordAttachmentTime<U32>::getMax(FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+    auto att_max_render_time_raw = FSPerfStats::StatsRecorder::getMax(AttType, FSPerfStats::StatType_t::RENDER_COMBINED);

    for (iter = complexity_list.begin(); iter != end; ++iter)
    {
        LLObjectComplexity object_complexity = *iter;        
-        // S32 obj_cost_short = llmax((S32)object_complexity.objectCost / 1000, 1);
-        auto attach_render_time = FSPerfStats::RecordAttachmentTime<U32>::get(object_complexity.objectId.getCRC32(), FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+        S32 obj_cost_short = llmax((S32)object_complexity.objectCost / 1000, 1);
+        auto attach_render_time_raw = FSPerfStats::StatsRecorder::get(AttType, object_complexity.objectId, FSPerfStats::StatType_t::RENDER_COMBINED);
        LLSD item;
        item["special_id"] = object_complexity.objectId;
        item["target"] = LLNameListCtrl::SPECIAL;
        LLSD& row = item["columns"];
-        row[0]["column"] = "complex_visual";
+        row[0]["column"] = "art_visual";
        row[0]["type"] = "bar";
        LLSD& value = row[0]["value"];
-        value["ratio"] = (F32)attach_render_time / max_render_time;
+        value["ratio"] = (F32)attach_render_time_raw / att_max_render_time_raw;
        value["bottom"] = BAR_BOTTOM_PAD;
        value["left_pad"] = BAR_LEFT_PAD;
        value["right_pad"] = BAR_RIGHT_PAD;

-        row[1]["column"] = "complex_value";
+        row[1]["column"] = "art_value";
        row[1]["type"] = "text";
        // row[1]["value"] = std::to_string(obj_cost_short);
-        row[1]["value"] = llformat("%.3f",((double)attach_render_time / 1000000));
+        row[1]["value"] = llformat( "%.4f", FSPerfStats::raw_to_us(attach_render_time_raw) );
        row[1]["font"]["name"] = "SANSSERIF";

-        row[2]["column"] = "name";
+        row[2]["column"] = "complex_value";
        row[2]["type"] = "text";
-        row[2]["value"] = object_complexity.objectName;
+        row[2]["value"] = std::to_string(obj_cost_short);
        row[2]["font"]["name"] = "SANSSERIF";

+        row[3]["column"] = "name";
+        row[3]["type"] = "text";
+        row[3]["value"] = object_complexity.objectName;
+        row[3]["font"]["name"] = "SANSSERIF";
+
        LLScrollListItem* obj = mObjectList->addElement(item);
        if (obj)
        {
@ -467,7 +502,7 @@ void LLFloaterPerformance::populateNearbyList()
    getNearbyAvatars(valid_nearby_avs);

    std::vector<LLCharacter*>::iterator char_iter = valid_nearby_avs.begin();
-    auto render_max = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::getMax(FSPerfStats::ObjStatType::RENDER_COMBINED);
+    auto av_render_max_raw = FSPerfStats::StatsRecorder::getMax(AvType, FSPerfStats::StatType_t::RENDER_COMBINED);
    while (char_iter != valid_nearby_avs.end())
    {
        LLVOAvatar* avatar = dynamic_cast<LLVOAvatar*>(*char_iter);
@ -477,40 +512,46 @@ void LLFloaterPerformance::populateNearbyList()
            if(overall_appearance == LLVOAvatar::AOA_INVISIBLE)
                continue;

-            // S32 complexity_short = llmax((S32)avatar->getVisualComplexity() / 1000, 1);
-            auto render_av  = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::get(avatar,FSPerfStats::ObjStatType::RENDER_COMBINED);
+            S32 complexity_short = llmax((S32)avatar->getVisualComplexity() / 1000, 1);
+            auto render_av_raw  = FSPerfStats::StatsRecorder::get(AvType, avatar->getID(),FSPerfStats::StatType_t::RENDER_COMBINED);
            auto is_slow = avatar->isTooSlow(true);
            // auto is_slow_without_shadows = avatar->isTooSlow();

            LLSD item;
            item["id"] = avatar->getID();
            LLSD& row = item["columns"];
-            row[0]["column"] = "complex_visual";
+            row[0]["column"] = "art_visual";
            row[0]["type"] = "bar";
            LLSD& value = row[0]["value"];
-            value["ratio"] = (double)render_av / render_max;
+            value["ratio"] = (double)render_av_raw / av_render_max_raw;
            value["bottom"] = BAR_BOTTOM_PAD;
            value["left_pad"] = BAR_LEFT_PAD;
            value["right_pad"] = BAR_RIGHT_PAD;

-            row[1]["column"] = "complex_value";
+            row[1]["column"] = "art_value";
            row[1]["type"] = "text";
            if(is_slow)
            {
-                row[1]["value"] = llformat("%.2f", ((double)avatar->getLastART() / 1000000));
+                row[1]["value"] = llformat( "%.2f", FSPerfStats::raw_to_ms( avatar->getLastART() ) );
            }
            else
            {
-                row[1]["value"] = llformat("%.2f",((double)render_av / 1000000));
+                row[1]["value"] = llformat( "%.2f", FSPerfStats::raw_to_ms( render_av_raw ) );
            }
-
            row[1]["font"]["name"] = "SANSSERIF";
            row[1]["width"] = "50";

-            row[2]["column"] = "name";
+            row[2]["column"] = "complex_value";
            row[2]["type"] = "text";
-            row[2]["value"] = avatar->getFullname();
+            row[2]["value"] = std::to_string(complexity_short);
            row[2]["font"]["name"] = "SANSSERIF";
+            row[2]["width"] = "50";
+
+
+            row[3]["column"] = "name";
+            row[3]["type"] = "text";
+            row[3]["value"] = avatar->getFullname();
+            row[3]["font"]["name"] = "SANSSERIF";

            LLScrollListItem* av_item = mNearbyList->addElement(item);
            if(av_item)
--- a/indra/newview/llspatialpartition.cpp
+++ b/indra/newview/llspatialpartition.cpp
@ -1128,6 +1128,7 @@ public:
 	
 	virtual S32 frustumCheck(const LLViewerOctreeGroup* group)
 	{
+		FSZone;
 		S32 res = AABBInFrustumNoFarClipGroupBounds(group);
 		if (res != 0)
 		{
@ -1138,6 +1139,7 @@ public:

 	virtual S32 frustumCheckObjects(const LLViewerOctreeGroup* group)
 	{
+		FSZone;
 		S32 res = AABBInFrustumNoFarClipObjectBounds(group);
 		if (res != 0)
 		{
@ -1148,6 +1150,7 @@ public:

 	virtual void processGroup(LLViewerOctreeGroup* base_group)
 	{
+		FSZone;
 		LLSpatialGroup* group = (LLSpatialGroup*)base_group;
 		if (group->needsUpdate() ||
 			group->getVisible(LLViewerCamera::sCurCameraID) < LLDrawable::getCurrentFrame() - 1)
--- a/indra/newview/llviewerdisplay.cpp
+++ b/indra/newview/llviewerdisplay.cpp
@ -1193,7 +1193,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)

 void render_hud_attachments()
 {
-	FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_HUDS);
+	FSPerfStats::RecordSceneTime T (LLUUID{}, FSPerfStats::StatType_t::RENDER_HUDS);
 	gGL.matrixMode(LLRender::MM_PROJECTION);
 	gGL.pushMatrix();
 	gGL.matrixMode(LLRender::MM_MODELVIEW);
@ -1401,7 +1401,7 @@ bool setup_hud_matrices(const LLRect& screen_region)

 void render_ui(F32 zoom_factor, int subfield)
 {
-	FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_UI);
+	FSPerfStats::RecordSceneTime T (LLUUID{}, FSPerfStats::StatType_t::RENDER_UI); // null id: scene-level stat, same form as render_hud_attachments()
    LL_RECORD_BLOCK_TIME(FTM_RENDER_UI);

 	LLGLState::checkStates();
@ -1487,7 +1487,7 @@ static LLTrace::BlockTimerStatHandle FTM_SWAP("Swap");

 void swap()
 {
-	FSPerfStats::RecordSceneTime T (FSPerfStats::SceneStatType::RENDER_SWAP);
+	FSPerfStats::RecordSceneTime T (LLUUID{}, FSPerfStats::StatType_t::RENDER_SWAP); // null id: scene-level stat, same form as render_hud_attachments()
 	LL_RECORD_BLOCK_TIME(FTM_SWAP);

 	if (gDisplaySwapBuffers)
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@ -233,20 +233,12 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 	}


+	std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
 	auto vobj = mFace->getViewerObject();
-	if(vobj && !vobj->asAvatar() && vobj->getAvatar()->isSelf())
+	if( vobj && vobj->isAttachment() )
 	{
-		LLViewerObject * vtop = vobj;
-		LLViewerObject * par  = (LLViewerObject *) vobj->getParent();
-
-		while (par && !(par->asAvatar()))
-		{
-			vtop = par;
-			par = (LLViewerObject *)vtop->getParent();
-		}
-		vobj = vtop;
+		T = trackMyAttachment(vobj);
 	}
-	FSPerfStats::RecordAttachmentTime<U32> T(vobj?vobj->getAttachmentItemID().getCRC32():0, FSPerfStats::ObjStatType::RENDER_GEOMETRY);

 	U32 triangle_count = 0;

--- a/indra/newview/llvieweroctree.cpp
+++ b/indra/newview/llvieweroctree.cpp
@ -1361,6 +1361,7 @@ bool LLViewerOctreeCull::earlyFail(LLViewerOctreeGroup* group)
 //virtual 
 void LLViewerOctreeCull::traverse(const OctreeNode* n)
 {
+	FSZone;
 	LLViewerOctreeGroup* group = (LLViewerOctreeGroup*) n->getListener(0);

 	if (earlyFail(group))
@ -1371,14 +1372,17 @@ void LLViewerOctreeCull::traverse(const OctreeNode* n)
 	if (mRes == 2 || 
 		(mRes && group->hasState(LLViewerOctreeGroup::SKIP_FRUSTUM_CHECK)))
 	{	//fully in, just add everything
+		FSZoneN("AllInside");
 		OctreeTraveler::traverse(n);
 	}
 	else
 	{
+		FSZoneN("Check inside?")
 		mRes = frustumCheck(group);
 				
 		if (mRes)
 		{ //at least partially in, run on down
+			FSZoneN("PartiallyIn");
 			OctreeTraveler::traverse(n);
 		}

--- a/indra/newview/llvoavatar.cpp
+++ b/indra/newview/llvoavatar.cpp
@ -9157,38 +9157,38 @@ bool LLVOAvatar::isTooSlow(bool combined) const
 	}

 	// Either we're not stale or we've updated.
-	U64 render_time;
-	U64 render_geom_time;
+	U64 render_time_raw;
+	U64 render_geom_time_raw;

 	if(!mARTCapped)
 	{
 		// no cap, so we use the live values
-		render_time = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::get(this,FSPerfStats::ObjStatType::RENDER_COMBINED);
-		render_geom_time = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::get(this,FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+		render_time_raw = FSPerfStats::StatsRecorder::get(FSPerfStats::ObjType_t::OT_AVATAR, this->getID(), FSPerfStats::StatType_t::RENDER_COMBINED);
+		render_geom_time_raw = FSPerfStats::StatsRecorder::get(FSPerfStats::ObjType_t::OT_AVATAR, this->getID(), FSPerfStats::StatType_t::RENDER_GEOMETRY);
 	}
 	else
 	{
 		// use the cached values.
-		render_time = mRenderTime;
-		render_geom_time = mGeomTime;		
+		render_time_raw = mRenderTime;
+		render_geom_time_raw = mGeomTime;		
 	}
-	if( (LLVOAvatar::sRenderTimeCap_ns > 0) && (render_time >= LLVOAvatar::sRenderTimeCap_ns) ) 
+	if( (LLVOAvatar::sRenderTimeCap_ns > 0) && (FSPerfStats::raw_to_ns(render_time_raw) >= LLVOAvatar::sRenderTimeCap_ns) ) 
 	{
 		if(!mARTCapped)
 		{			
 			// if we weren't capped, we are now
-			abuse_constness->mRenderTime = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::get(this,FSPerfStats::ObjStatType::RENDER_COMBINED);
-			abuse_constness->mGeomTime = FSPerfStats::RecordObjectTime<const LLVOAvatar*>::get(this,FSPerfStats::ObjStatType::RENDER_GEOMETRY);
+			abuse_constness->mRenderTime = render_time_raw;
+			abuse_constness->mGeomTime = render_geom_time_raw;
 			abuse_constness->mARTStale = false;
 			abuse_constness->mARTCapped = true;
 			abuse_constness->mLastARTUpdateFrame = LLFrameTimer::getFrameCount();
-			LL_INFOS() << this->getFullname() << " ("<< (combined?"combined":"geometry") << ") mLastART too high = " << render_time << " vs ("<< LLVOAvatar::sRenderTimeCap_ns << " set @ " << mLastARTUpdateFrame << LL_ENDL;
+			LL_INFOS() << this->getFullname() << " ("<< (combined?"combined":"geometry") << ") mLastART too high = " << FSPerfStats::raw_to_ns(render_time_raw) << " vs ("<< LLVOAvatar::sRenderTimeCap_ns << " set @ " << mLastARTUpdateFrame << LL_ENDL;
 		}
 		// return true only if that is the case in the context of the combined/geom_only flag.
-		return combined ? true : (render_geom_time >= LLVOAvatar::sRenderTimeCap_ns);
+		return combined ? true : (FSPerfStats::raw_to_ns(render_geom_time_raw) >= LLVOAvatar::sRenderTimeCap_ns); // cap is in ns, so convert the raw count first (as the combined check does)
 	}

-	LL_INFOS() << this->getFullname() << " ("<< (combined?"combined":"geometry") << ") good render time = " << render_time << " vs ("<< LLVOAvatar::sRenderTimeCap_ns << " set @ " << mLastARTUpdateFrame << LL_ENDL;
+	LL_INFOS() << this->getFullname() << " ("<< (combined?"combined":"geometry") << ") good render time = " << FSPerfStats::raw_to_ns(render_time_raw) << " vs ("<< LLVOAvatar::sRenderTimeCap_ns << " set @ " << mLastARTUpdateFrame << LL_ENDL;
 	abuse_constness->mARTCapped = false;
 	return false;
 }
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@ -5546,7 +5546,7 @@ void LLVolumeGeometryManager::registerFace(LLSpatialGroup* group, LLFace* facep,
 			}
 		}
 		
-		if (type == LLRenderPass::PASS_ALPHA)
+		// if (type == LLRenderPass::PASS_ALPHA) // <FS:Beq> allow tracking through pipeline
 		{ //for alpha sorting
 			facep->setDrawInfo(draw_info);
 		}
@ -5784,6 +5784,14 @@ void LLVolumeGeometryManager::rebuildGeom(LLSpatialGroup* group)
 				continue;
 			}

+			std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+			// <FS:Beq> Capture render times
+			if(vobj->isAttachment())
+			{
+				T= trackMyAttachment(vobj);
+			}
+			// </FS:Beq>
+
 //<FS:Beq> Stop doing stupid stuff we don;t need to.
 // Moving this inside a debug enabled check
 //			std::string vobj_name = llformat("Vol%p", vobj);
@ -6382,20 +6390,14 @@ void LLVolumeGeometryManager::rebuildMesh(LLSpatialGroup* group)

 			if (drawablep && !drawablep->isDead() && drawablep->isState(LLDrawable::REBUILD_ALL) && !drawablep->isState(LLDrawable::RIGGED) )
 			{
+				FSZoneN("Rebuild all non-Rigged")
 				LLVOVolume* vobj = drawablep->getVOVolume();
-				LLVOVolume* rootAtt{};
+				std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+
 				if(vobj->isAttachment())
 				{
-					auto par = (LLVOVolume*)vobj->getParent();
-					rootAtt = vobj;
-					while( par->isAttachment() )
-					{
-						rootAtt = par;
-						par = (LLVOVolume*)par->getParent();
-					}
-					LL_INFOS() << "recording time for ATT@" << rootAtt << " " << (rootAtt?rootAtt->getAttachmentItemName():"null") << LL_ENDL;
+					T = trackMyAttachment(vobj);
 				}
-				FSPerfStats::RecordAttachmentTime<U32> T(rootAtt?rootAtt->getAttachmentItemID().getCRC32():0, FSPerfStats::ObjStatType::RENDER_GEOMETRY);
 				//<FS:Beq> avoid unfortunate sleep during trylock by static check
 				//if(debugLoggingEnabled("AnimatedObjectsLinkset"))
 				static auto debug_logging_on = debugLoggingEnabled("AnimatedObjectsLinkset");
@ -6808,10 +6810,18 @@ U32 LLVolumeGeometryManager::genDrawInfo(LLSpatialGroup* group, U32 mask, LLFace
 		U32 indices_index = 0;
 		U16 index_offset = 0;

+		std::unique_ptr<FSPerfStats::RecordAttachmentTime> T{};
+		LLViewerObject * lastVObj{nullptr};
 		while (face_iter < i)
 		{
 			//update face indices for new buffer
 			facep = *face_iter;
+			LLViewerObject* vobj = facep->getViewerObject();
+			if(vobj != lastVObj)
+			{	// object changed: end the previous timer first so later faces are not billed to that attachment
+				T.reset(); lastVObj = vobj;
+				if(vobj && vobj->isAttachment()) { T = trackMyAttachment(vobj); } // only attachments are timed
+			}			
 			if (buffer.isNull())
 			{
 				// Bulk allocation failed
@ -7027,8 +7037,12 @@ U32 LLVolumeGeometryManager::genDrawInfo(LLSpatialGroup* group, U32 mask, LLFace
 			else if (is_alpha)
 			{
 				// can we safely treat this as an alpha mask?
-				if (facep->getFaceColor().mV[3] <= 0.f)
+				// <FS:Beq> Face colour alone is not a reliable test (nothing appears to set it), so check the TE alpha as well.
+				// if (facep->getFaceColor().mV[3] <= 0.f)
+				if (te->getAlpha() <=0.f || facep->getFaceColor().mV[3] <= 0.f)
+				// </FS:Beq>
 				{ //100% transparent, don't render unless we're highlighting transparent
+					FSZoneN("facep->alpha -> invisible");
 					registerFace(group, facep, LLRenderPass::PASS_ALPHA_INVISIBLE);
 				}
 				else if (facep->canRenderAsMask())
--- a/indra/newview/pipeline.cpp
+++ b/indra/newview/pipeline.cpp
@ -3549,6 +3549,8 @@ void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result)
 	//LLVertexBuffer::unbind();

 	grabReferences(result);
+	{
+		FSZoneN("checkOcclusionAndRebuildMesh");
 	for (LLCullResult::sg_iterator iter = sCull->beginDrawableGroups(); iter != sCull->endDrawableGroups(); ++iter)
 	{
 		LLSpatialGroup* group = *iter;
@ -3572,9 +3574,11 @@ void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result)
 			}
 		}
 	}
+	}

 	if (LLViewerCamera::sCurCameraID == LLViewerCamera::CAMERA_WORLD)
 	{
+		FSZoneN("WorldCamera");
 		LLSpatialGroup* last_group = NULL;
 		BOOL fov_changed = LLViewerCamera::getInstance()->isDefaultFOVChanged();
 		for (LLCullResult::bridge_iterator i = sCull->beginVisibleBridge(); i != sCull->endVisibleBridge(); ++i)
@ -3608,7 +3612,8 @@ void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result)
 			last_group->mLastUpdateDistance = last_group->mDistance;
 		}
 	}
-
+	{
+		FSZoneN("StateSort: visible groups");
 	for (LLCullResult::sg_iterator iter = sCull->beginVisibleGroups(); iter != sCull->endVisibleGroups(); ++iter)
 	{
 		LLSpatialGroup* group = *iter;
@ -3627,7 +3632,7 @@ void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result)
 				group->rebuildMesh();
 			}
 		}
-	}
+	}}
 	
 	{
 		LL_RECORD_BLOCK_TIME(FTM_STATESORT_DRAWABLE);
@ -3983,6 +3988,8 @@ void LLPipeline::postSort(LLCamera& camera)

 	LL_PUSH_CALLSTACKS();
 	//rebuild drawable geometry
+	{
+		FSZoneN("PostSort: rebuildGeom")
 	for (LLCullResult::sg_iterator i = sCull->beginDrawableGroups(); i != sCull->endDrawableGroups(); ++i)
 	{
 		LLSpatialGroup* group = *i;
@ -4001,6 +4008,8 @@ void LLPipeline::postSort(LLCamera& camera)

 	
 	//build render map
+	{
+		FSZoneN("build render map");
 	for (LLCullResult::sg_iterator i = sCull->beginVisibleGroups(); i != sCull->endVisibleGroups(); ++i)
 	{
 		LLSpatialGroup* group = *i;
@ -4046,6 +4055,7 @@ void LLPipeline::postSort(LLCamera& camera)

 		if (hasRenderType(LLPipeline::RENDER_TYPE_PASS_ALPHA))
 		{
+			FSZoneN("Collect Alpha groups"); // named zone: the argument-less FSZone would not record this label
 			LLSpatialGroup::draw_map_t::iterator alpha = group->mDrawMap.find(LLRenderPass::PASS_ALPHA);
 			
 			if (alpha != group->mDrawMap.end())
@ -4071,6 +4081,7 @@ void LLPipeline::postSort(LLCamera& camera)
 			}
 		}
 	}
+	}
 	
 	//flush particle VB
 	if (LLVOPartGroup::sVB)
@ -4096,12 +4107,14 @@ void LLPipeline::postSort(LLCamera& camera)
 		
 		glBeginQueryARB(GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN, mMeshDirtyQueryObject);
 	}*/
-
+	{
+		FSZoneN("rebuild delayed upd groups")	}
 	//pack vertex buffers for groups that chose to delay their updates
 	for (LLSpatialGroup::sg_vector_t::iterator iter = mMeshDirtyGroup.begin(); iter != mMeshDirtyGroup.end(); ++iter)
 	{
 		(*iter)->rebuildMesh();
 	}
+	}

 	/*if (use_transform_feedback)
 	{
@ -4110,12 +4123,17 @@ void LLPipeline::postSort(LLCamera& camera)
 	
 	mMeshDirtyGroup.clear();

+	{
+		FSZoneN("sort alpha groups")
 	if (!sShadowRender)
 	{
 		std::sort(sCull->beginAlphaGroups(), sCull->endAlphaGroups(), LLSpatialGroup::CompareDepthGreater());
 	}
+	}

 	LL_PUSH_CALLSTACKS();
+	{
+	FSZoneN("beacon rendering flags");
 	// only render if the flag is set. The flag is only set if we are in edit mode or the toggle is set in the menus
 	// Ansariel: Make beacons also show when beacons floater is closed.
 	if (/*LLFloaterReg::instanceVisible("beacons") &&*/ !sShadowRender)
@ -4169,6 +4187,7 @@ void LLPipeline::postSort(LLCamera& camera)
 			forAllVisibleDrawables(renderSoundHighlights);
 		}
 	}
+	}
 	LL_PUSH_CALLSTACKS();
 	// If managing your telehub, draw beacons at telehub and currently selected spawnpoint.
 	if (LLFloaterTelehub::renderBeacons())
@ -4178,6 +4197,7 @@ void LLPipeline::postSort(LLCamera& camera)

 	if (!sShadowRender)
 	{
+		FSZoneN("Render face highlights");
 		mSelectedFaces.clear();

 		LLPipeline::setRenderHighlightTextureChannel(gFloaterTools->getPanelFace()->getTextureChannelToEdit());
--- a/indra/newview/skins/default/xui/en/panel_performance_complexity.xml
+++ b/indra/newview/skins/default/xui/en/panel_performance_complexity.xml
@ -91,8 +91,12 @@
    width="540">
      <name_list.columns
       label=""
-       name="complex_visual"
+       name="art_visual"
       width="90" />
+      <name_list.columns
+       label=""
+       name="art_value"
+       width="80" />
      <name_list.columns
       label=""
       name="complex_value"
--- a/indra/newview/skins/default/xui/en/panel_performance_huds.xml
+++ b/indra/newview/skins/default/xui/en/panel_performance_huds.xml
@ -84,10 +84,9 @@
        <name_list.columns
         label=""
         name="complex_value"
-         width="40" />
+         width="80" />
        <name_list.columns
         label=""
         name="name"/>
  </name_list>
 </panel>
-
--- a/indra/newview/skins/default/xui/en/panel_performance_nearby.xml
+++ b/indra/newview/skins/default/xui/en/panel_performance_nearby.xml
@ -148,12 +148,16 @@
    width="540">
        <name_list.columns
         label=""
-         name="complex_visual"
+         name="art_visual"
         width="90" />
+        <name_list.columns
+         label=""
+         name="art_value"
+         width="80" />
        <name_list.columns
         label=""
         name="complex_value"
-         width="50" />
+         width="40" />
        <name_list.columns
         label=""
         name="name"/>