Linux: Pull in Alchemy's vertex cache optimizer fix to smooth out triangle soup in certain hair/tails - Thank you!

master
Zi Ree 2023-02-16 17:46:43 +01:00
parent baa9c6d731
commit 79c9873cd0
1 changed file with 241 additions and 3 deletions

@@ -52,6 +52,12 @@
#include "llmeshoptimizer.h"
#include "lltimer.h"
// <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
#ifdef LL_LINUX
#include "meshoptimizer/meshoptimizer.h"
#endif
// </FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
#define DEBUG_SILHOUETTE_BINORMALS 0
#define DEBUG_SILHOUETTE_NORMALS 0 // TomY: Use this to display normals using the silhouette
#define DEBUG_SILHOUETTE_EDGE_MAP 0 // DaveP: Use this to display edge map using the silhouette
@@ -5432,6 +5438,204 @@ public:
    }
};
// <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
#ifdef LL_LINUX
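// Helper for the meshoptimizer path: (re)allocates a face's vertex data as a
// single 64-byte-aligned block laid out as [positions | normals | texcoords],
// so mNormals and mTexCoords point into the mPositions allocation.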
bool allocateVertices(LLVolumeFace* self, S32 num_verts)
{
    bool copy = false;
    if (!copy || !num_verts)
    {
        ll_aligned_free<64>(self->mPositions);
        self->mPositions = nullptr;
        self->mNormals = nullptr;
        self->mTexCoords = nullptr;
    }

    if (num_verts)
    {
        const U32 new_vsize = num_verts * sizeof(LLVector4a);
        const U32 new_nsize = new_vsize;
        const U32 new_tcsize = (num_verts * sizeof(LLVector2) + 0xF) & ~0xF;
        const U32 new_size = new_vsize + new_nsize + new_tcsize;

        //allocate new buffer space
        LLVector4a* old_buf = self->mPositions;
        self->mPositions = (LLVector4a*)ll_aligned_malloc<64>(new_size);
        if (!self->mPositions)
        {
            LL_WARNS("LLVOLUME") << "Allocation of positions vector[" << new_size << "] failed. " << LL_ENDL;
            return false;
        }
        self->mNormals = self->mPositions + num_verts;
        self->mTexCoords = (LLVector2*)(self->mNormals + num_verts);

        if (copy && old_buf)
        {
            U32 verts_to_copy = std::min(self->mNumVertices, num_verts);
            if (verts_to_copy)
            {
                const U32 old_vsize = verts_to_copy * sizeof(LLVector4a);
                const U32 old_nsize = old_vsize;
                const U32 old_tcsize = (verts_to_copy * sizeof(LLVector2) + 0xF) & ~0xF;

                LLVector4a::memcpyNonAliased16((F32*)self->mPositions, (F32*)old_buf, old_vsize);
                LLVector4a::memcpyNonAliased16((F32*)self->mNormals, (F32*)(old_buf + self->mNumVertices), old_nsize);
                LLVector4a::memcpyNonAliased16((F32*)self->mTexCoords, (F32*)(old_buf + self->mNumVertices * 2), old_tcsize);
            }
            ll_aligned_free<64>(old_buf);
        }
    }

    self->mNumAllocatedVertices = num_verts;

    return true;
}

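// Linux replacement for the Forsyth optimizer below: meshoptimizer first welds
// duplicate vertices, then reorders triangles for the post-transform vertex
// cache and reorders vertex storage for fetch locality.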
bool LLVolumeFace::cacheOptimize()
{
    llassert(!mOptimized);
    mOptimized = TRUE;

    if (mNumVertices < 3 || mNumIndices < 3)
    { //nothing to do
        return true;
    }

    struct buffer_data_t {
        void** dst;    // Double pointer to volume attribute data. Avoids fixup after reallocating buffers on resize.
        void* scratch; // Scratch buffer. Allocated with vert count from meshopt_generateVertexRemapMulti
        size_t stride; // Stride between contiguous attributes
    };
    std::vector<meshopt_Stream> streams; // Contains data necessary for meshopt_generateVertexRemapMulti call
    std::vector<buffer_data_t> buffers;  // Contains data necessary for meshopt_remapVertexBuffer calls.

    {
        static struct { size_t offs; size_t size; size_t stride; } ref_streams[] = {
            { (U64) &mPositions - (U64) this, sizeof(float) * 3, sizeof(mPositions[0]) },
            { (U64) &mNormals - (U64) this, sizeof(float) * 3, sizeof(mNormals[0]) },     // Subsection of mPositions allocation
            { (U64) &mTexCoords - (U64) this, sizeof(float) * 2, sizeof(mTexCoords[0]) }, // Subsection of mPositions allocation
            { (U64) &mTangents - (U64) this, sizeof(float) * 3, sizeof(mTangents[0]) },
            { (U64) &mWeights - (U64) this, sizeof(float) * 3, sizeof(mWeights[0]) },
        };

        for (size_t i = 0; i < sizeof(ref_streams) / sizeof(ref_streams[0]); ++i)
        {
            void** ptr = reinterpret_cast<void**>((char*)this + ref_streams[i].offs);
            if (*ptr)
            {
                streams.push_back({ *ptr, ref_streams[i].size, ref_streams[i].stride });
                buffers.push_back({ ptr, nullptr, ref_streams[i].stride });
            }
        }
    }

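    // Build the weld table: meshopt_generateVertexRemapMulti maps each vertex to a
    // canonical index, merging vertices that are bit-identical in every active
    // stream, and returns the resulting (smaller or equal) vertex count.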
    std::vector<unsigned int> remap;
    try
    {
        // Note: resize(), not reserve() - meshopt writes the remap table through
        // data(), which is only valid for elements inside the vector's size.
        remap.resize(mNumIndices);
    }
    catch (const std::bad_alloc&)
    {
        return false;
    }

    size_t total_vertices = meshopt_generateVertexRemapMulti(remap.data(), mIndices, mNumIndices, mNumVertices, streams.data(), streams.size());
    meshopt_remapIndexBuffer(mIndices, mIndices, mNumIndices, remap.data());

    bool failed = false;
    for (auto& entry : buffers)
    {
        // Create scratch buffer for attribute data. Avoids extra allocs in meshopt_remapVertexBuffer calls
        void* buf_tmp = ll_aligned_malloc_16(entry.stride * total_vertices);
        if (!buf_tmp)
        {
            failed = true;
            break;
        }
        entry.scratch = buf_tmp;

        // Write to scratch buffer
        meshopt_remapVertexBuffer(entry.scratch, *entry.dst, mNumVertices, entry.stride, remap.data());
    }

    if (failed)
    {
        for (auto& entry : buffers)
        {
            // Release scratch buffer
            ll_aligned_free_16(entry.scratch);
        }
        return false;
    }

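    // If welding changed the vertex count, the attribute arrays must be
    // reallocated at the new size before the remapped data is copied back
    // from the scratch buffers.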
    if (mNumAllocatedVertices != total_vertices)
    {
        // New allocations will be transparently accessible through dereferencing the buffers entries.
        if (!allocateVertices(this, total_vertices))
        {
            for (auto& entry : buffers)
            {
                // Release scratch buffer
                ll_aligned_free_16(entry.scratch);
            }
            allocateVertices(this, 0);
            allocateWeights(0);
            allocateTangents(0);
            return false;
        }

        if (mWeights)
        {
            allocateWeights(total_vertices);
            if (!mWeights)
            {
                for (auto& entry : buffers)
                {
                    // Release scratch buffer
                    ll_aligned_free_16(entry.scratch);
                }
                allocateVertices(this, 0);
                allocateWeights(0);
                allocateTangents(0);
                return false;
            }
        }

        if (mTangents)
        {
            allocateTangents(total_vertices);
            if (!mTangents)
            {
                for (auto& entry : buffers)
                {
                    // Release scratch buffer
                    ll_aligned_free_16(entry.scratch);
                }
                allocateVertices(this, 0);
                allocateWeights(0);
                allocateTangents(0);
                return false;
            }
        }
    }

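    // Two optimization passes: reorder triangles for post-transform vertex
    // cache reuse, then compute a vertex fetch remap so the new index order
    // reads vertex memory as sequentially as possible.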
    meshopt_optimizeVertexCache(mIndices, mIndices, mNumIndices, total_vertices);
    //meshopt_optimizeOverdraw(mIndices, mIndices, mNumIndices, (float*)buffers[0].scratch, total_vertices, buffers[0].stride, 1.05f);
    meshopt_optimizeVertexFetchRemap(remap.data(), mIndices, mNumIndices, total_vertices);
    meshopt_remapIndexBuffer(mIndices, mIndices, mNumIndices, remap.data());

    for (auto& entry : buffers)
    {
        // Write to llvolume attribute buffer
        meshopt_remapVertexBuffer(*entry.dst, entry.scratch, total_vertices, entry.stride, remap.data());
        // Release scratch buffer
        ll_aligned_free_16(entry.scratch);
    }

    mNumVertices = total_vertices;

    return true;
}
#else
// </FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
bool LLVolumeFace::cacheOptimize()
{ //optimize for vertex cache according to Forsyth method:
@@ -5447,7 +5651,7 @@ bool LLVolumeFace::cacheOptimize()
// windows version.
//
#ifndef LL_LINUX
// #ifndef LL_LINUX // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
    LLVCacheLRU cache;

    if (mNumVertices < 3 || mNumIndices < 3)
@ -5691,10 +5895,11 @@ bool LLVolumeFace::cacheOptimize()
    //std::string result = llformat("ACMR pre/post: %.3f/%.3f -- %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
    //LL_INFOS() << result << LL_ENDL;
#endif
    // #endif // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
    return true;
}
#endif // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
void LLVolumeFace::createOctree(F32 scaler, const LLVector4a& center, const LLVector4a& size)
{
@@ -6641,15 +6846,48 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con
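// On the Linux path the two allocators below null their pointer first and
// treat num_verts == 0 as a pure deallocation, so callers can detect a failed
// or empty allocation by checking for nullptr.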
void LLVolumeFace::allocateTangents(S32 num_verts)
{
    // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
#ifdef LL_LINUX
    ll_aligned_free_16(mTangents);
    mTangents = nullptr;

    if (num_verts)
    {
        mTangents = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
        if (!mTangents)
        {
            LL_WARNS("LLVOLUME") << "Allocation of tangents[" << sizeof(LLVector4a)*num_verts << "] failed" << LL_ENDL;
            return;
        }
    }
    return;
#else
    // </FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
    ll_aligned_free_16(mTangents);
    mTangents = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
#endif // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
}

void LLVolumeFace::allocateWeights(S32 num_verts)
{
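    // Same pattern as allocateTangents above, applied to the skin weight buffer.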
    // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
#ifdef LL_LINUX
    ll_aligned_free_16(mWeights);
    mWeights = nullptr;

    if (num_verts)
    {
        mWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
        if (!mWeights)
        {
            LL_WARNS("LLVOLUME") << "Allocation of weights[" << sizeof(LLVector4a) * num_verts << "] failed" << LL_ENDL;
            return;
        }
    }
    return;
#else
    // </FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
    ll_aligned_free_16(mWeights);
    mWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
#endif // <FS:Zi> Use Alchemy's vertex cache optimizer for Linux. Thank you!
}

void LLVolumeFace::allocateJointIndices(S32 num_verts)