diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index d8e371e2c8..a680976918 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -52,6 +52,12 @@
#include "llmeshoptimizer.h"
#include "lltimer.h"
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
+#ifdef LL_LINUX
+#include "meshoptimizer/meshoptimizer.h"
+#endif
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
+
#define DEBUG_SILHOUETTE_BINORMALS 0
#define DEBUG_SILHOUETTE_NORMALS 0 // TomY: Use this to display normals using the silhouette
#define DEBUG_SILHOUETTE_EDGE_MAP 0 // DaveP: Use this to display edge map using the silhouette
@@ -5432,6 +5438,204 @@ public:
}
};
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
+#ifdef LL_LINUX
+
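+// Local helper: (re)allocates the combined positions/normals/texcoords block as a
+// single 64-byte-aligned buffer; mNormals and mTexCoords are carved out of the
+// same allocation as mPositions. Note that 'copy' is currently always false:
+// cacheOptimize() repopulates the new buffers from its own scratch copies rather
+// than relying on this helper to preserve old contents.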
+bool allocateVertices(LLVolumeFace* self, S32 num_verts)
+{
+ bool copy = false;
+
+ if (!copy || !num_verts)
+ {
+ ll_aligned_free<64>(self->mPositions);
+ self->mPositions = nullptr;
+ self->mNormals = nullptr;
+ self->mTexCoords = nullptr;
+ }
+
+ if (num_verts)
+ {
+ const U32 new_vsize = num_verts * sizeof(LLVector4a);
+ const U32 new_nsize = new_vsize;
+ const U32 new_tcsize = (num_verts * sizeof(LLVector2) + 0xF) & ~0xF;
+ const U32 new_size = new_vsize + new_nsize + new_tcsize;
+
+ //allocate new buffer space
+ LLVector4a* old_buf = self->mPositions;
+ self->mPositions = (LLVector4a*)ll_aligned_malloc<64>(new_size);
+ if (!self->mPositions)
+ {
+ LL_WARNS("LLVOLUME") << "Allocation of positions vector[" << new_size << "] failed. " << LL_ENDL;
+ return false;
+ }
+ self->mNormals = self->mPositions + num_verts;
+ self->mTexCoords = (LLVector2*)(self->mNormals + num_verts);
+
+ if (copy && old_buf)
+ {
+ U32 verts_to_copy = std::min(self->mNumVertices, num_verts);
+ if (verts_to_copy)
+ {
+ const U32 old_vsize = verts_to_copy * sizeof(LLVector4a);
+ const U32 old_nsize = old_vsize;
+ const U32 old_tcsize = (verts_to_copy * sizeof(LLVector2) + 0xF) & ~0xF;
+
+ LLVector4a::memcpyNonAliased16((F32*)self->mPositions, (F32*)old_buf, old_vsize);
+ LLVector4a::memcpyNonAliased16((F32*)self->mNormals, (F32*)(old_buf + self->mNumVertices), old_nsize);
+ LLVector4a::memcpyNonAliased16((F32*)self->mTexCoords, (F32*)(old_buf + self->mNumVertices * 2), old_tcsize);
+ }
+ ll_aligned_free<64>(old_buf);
+ }
+ }
+
+ self->mNumAllocatedVertices = num_verts;
+ return true;
+}
+
+bool LLVolumeFace::cacheOptimize()
+{
+ llassert(!mOptimized);
+ mOptimized = TRUE;
+
+ if (mNumVertices < 3 || mNumIndices < 3)
+ { //nothing to do
+ return true;
+ }
+
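+ // Overall flow: describe each present vertex attribute as a meshopt_Stream,
+ // build a remap table that merges duplicate vertices across all streams, remap
+ // the index and vertex buffers, then optimize for vertex cache locality and
+ // vertex fetch order.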
+ struct buffer_data_t {
+ void** dst; // Double pointer to volume attribute data. Avoids fixup after reallocating buffers on resize.
+ void* scratch; // Scratch buffer. Allocated with vert count from meshopt_generateVertexRemapMulti
+ size_t stride; // Byte stride between consecutive elements of the attribute
+ };
+ std::vector<meshopt_Stream> streams; // Contains data necessary for meshopt_generateVertexRemapMulti call
+ std::vector<buffer_data_t> buffers; // Contains data necessary for meshopt_remapVertexBuffer calls.
+
+ {
+ static struct { size_t offs; size_t size; size_t stride; } ref_streams[] = {
+ { (U64) &mPositions - (U64) this, sizeof(float) * 3, sizeof(mPositions[0]) },
+ { (U64) &mNormals - (U64) this, sizeof(float) * 3, sizeof(mNormals[0]) }, // Subsection of mPositions allocation
+ { (U64) &mTexCoords - (U64) this, sizeof(float) * 2, sizeof(mTexCoords[0]) }, // Subsection of mPositions allocation
+ { (U64) &mTangents - (U64) this, sizeof(float) * 3, sizeof(mTangents[0]) },
+ { (U64) &mWeights - (U64) this, sizeof(float) * 3, sizeof(mWeights[0]) },
+ };
+
+ for (size_t i = 0; i < sizeof(ref_streams) / sizeof(ref_streams[0]); ++i)
+ {
+ void** ptr = reinterpret_cast<void**>((char*)this + ref_streams[i].offs);
+ if (*ptr)
+ {
+ streams.push_back({ *ptr, ref_streams[i].size, ref_streams[i].stride });
+ buffers.push_back({ ptr, nullptr, ref_streams[i].stride });
+ }
+ }
+ }
+
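+ // remap[i] holds the post-deduplication index for original vertex i.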
+ std::vector<unsigned int> remap;
+ try
+ {
+ remap.resize(mNumIndices);
+ }
+ catch (const std::bad_alloc&)
+ {
+ return false;
+ }
+
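+ // meshopt_generateVertexRemapMulti fills the remap table from all attribute
+ // streams and returns the number of unique vertices; the index buffer is then
+ // rewritten in place against the deduplicated vertex ordering.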
+ size_t total_vertices = meshopt_generateVertexRemapMulti(remap.data(), mIndices, mNumIndices, mNumVertices, streams.data(), streams.size());
+ meshopt_remapIndexBuffer(mIndices, mIndices, mNumIndices, remap.data());
+ bool failed = false;
+ for (auto& entry : buffers)
+ {
+ // Create scratch buffer for attribute data. Avoids extra allocs in meshopt_remapVertexBuffer calls
+ void* buf_tmp = ll_aligned_malloc_16(entry.stride * total_vertices);
+ if (!buf_tmp)
+ {
+ failed = true;
+ break;
+ }
+ entry.scratch = buf_tmp;
+ // Write to scratch buffer
+ meshopt_remapVertexBuffer(entry.scratch, *entry.dst, mNumVertices, entry.stride, remap.data());
+ }
+
+ if (failed)
+ {
+ for (auto& entry : buffers)
+ {
+ // Release scratch buffer
+ ll_aligned_free_16(entry.scratch);
+ }
+ return false;
+ }
+
+ if (mNumAllocatedVertices != total_vertices)
+ {
+ // New allocations are transparently accessible by dereferencing the double pointers stored in 'buffers'.
+ if (!allocateVertices(this, total_vertices))
+ {
+ for (auto& entry : buffers)
+ {
+ // Release scratch buffer
+ ll_aligned_free_16(entry.scratch);
+ }
+ allocateVertices(this, 0);
+ allocateWeights(0);
+ allocateTangents(0);
+ return false;
+ }
+
+ if (mWeights)
+ {
+ allocateWeights(total_vertices);
+ if(!mWeights)
+ {
+ for (auto& entry : buffers)
+ {
+ // Release scratch buffer
+ ll_aligned_free_16(entry.scratch);
+ }
+ allocateVertices(this, 0);
+ allocateWeights(0);
+ allocateTangents(0);
+ return false;
+ }
+ }
+
+ if (mTangents)
+ {
+ allocateTangents(total_vertices);
+ if(!mTangents)
+ {
+ for (auto& entry : buffers)
+ {
+ // Release scratch buffer
+ ll_aligned_free_16(entry.scratch);
+ }
+ allocateVertices(this, 0);
+ allocateWeights(0);
+ allocateTangents(0);
+ return false;
+ }
+ }
+ }
+
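+ // Reorder indices for post-transform vertex cache locality, then build a fetch
+ // remap so vertices end up laid out in the order they are first referenced.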
+ meshopt_optimizeVertexCache(mIndices, mIndices, mNumIndices, total_vertices);
+ //meshopt_optimizeOverdraw(mIndices, mIndices, mNumIndices, (float*)buffers[0].scratch, total_vertices, buffers[0].stride, 1.05f);
+ meshopt_optimizeVertexFetchRemap(remap.data(), mIndices, mNumIndices, total_vertices);
+ meshopt_remapIndexBuffer(mIndices, mIndices, mNumIndices, remap.data());
+ for (auto& entry : buffers)
+ {
+ // Write to llvolume attribute buffer
+ meshopt_remapVertexBuffer(*entry.dst, entry.scratch, total_vertices, entry.stride, remap.data());
+ // Release scratch buffer
+ ll_aligned_free_16(entry.scratch);
+ }
+ mNumVertices = total_vertices;
+
+ return true;
+}
+
+#else
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
bool LLVolumeFace::cacheOptimize()
{ //optimize for vertex cache according to Forsyth method:
@@ -5447,7 +5651,7 @@ bool LLVolumeFace::cacheOptimize()
// windows version.
//
-#ifndef LL_LINUX
+// #ifndef LL_LINUX // Use Alchemy's vertex cache optimizer for Linux. Thank you!
LLVCacheLRU cache;
if (mNumVertices < 3 || mNumIndices < 3)
@@ -5691,10 +5895,11 @@ bool LLVolumeFace::cacheOptimize()
//std::string result = llformat("ACMR pre/post: %.3f/%.3f -- %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
//LL_INFOS() << result << LL_ENDL;
-#endif
+// #endif // Use Alchemy's vertex cache optimizer for Linux. Thank you!
return true;
}
+#endif // Use Alchemy's vertex cache optimizer for Linux. Thank you!
void LLVolumeFace::createOctree(F32 scaler, const LLVector4a& center, const LLVector4a& size)
{
@@ -6641,15 +6846,48 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con
void LLVolumeFace::allocateTangents(S32 num_verts)
{
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
+#ifdef LL_LINUX
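+ // Linux variant: free and clear first, and leave mTangents null on a zero count
+ // or allocation failure so callers such as cacheOptimize() can detect the error.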
+ ll_aligned_free_16(mTangents);
+ mTangents = nullptr;
+ if (num_verts)
+ {
+ mTangents = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
+ if (!mTangents)
+ {
+ LL_WARNS("LLVOLUME") << "Allocation of binormals[" << sizeof(LLVector4a)*num_verts << "] failed" << LL_ENDL;
+ return;
+ }
+ }
+ return;
+#else
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
ll_aligned_free_16(mTangents);
mTangents = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
+#endif // Use Alchemy's vertex cache optimizer for Linux. Thank you!
}
void LLVolumeFace::allocateWeights(S32 num_verts)
{
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
+#ifdef LL_LINUX
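+ // Linux variant: same pattern as allocateTangents; mWeights stays null on failure.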
+ ll_aligned_free_16(mWeights);
+ mWeights = nullptr;
+ if (num_verts)
+ {
+ mWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
+ if (!mWeights)
+ {
+ LL_WARNS("LLVOLUME") << "Allocation of weights[" << sizeof(LLVector4a) * num_verts << "] failed" << LL_ENDL;
+ return;
+ }
+ }
+ return;
+#else
+// Use Alchemy's vertex cache optimizer for Linux. Thank you!
ll_aligned_free_16(mWeights);
mWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-
+#endif // Use Alchemy's vertex cache optimizer for Linux. Thank you!
}
void LLVolumeFace::allocateJointIndices(S32 num_verts)