Move fast memcpy to llcommon and use it in LLAlignedArray::push_back on all platforms. Code Review: DaveP

master
Graham Madarasz (Graham Linden) 2013-03-13 10:42:40 -07:00
parent ab60c46a91
commit 5d2fea6262
6 changed files with 120 additions and 105 deletions

View File

@@ -29,10 +29,6 @@
 #include "llmemory.h"
-#if LL_WINDOWS
-#include "llvector4a.h" // for 16b fast copy
-#endif
 
 template <class T, U32 alignment>
 class LLAlignedArray
 {
@@ -81,11 +77,7 @@ void LLAlignedArray<T, alignment>::push_back(const T& elem)
     T* new_buf = (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment);
     if (mArray)
     {
-#if LL_WINDOWS
-        LLVector4a::memcpyNonAliased16((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount);
-#else
-        memcpy((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount);
-#endif
+        ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount);
     }
     old_buf = mArray;
     mArray = new_buf;
@@ -106,11 +98,7 @@ void LLAlignedArray<T, alignment>::resize(U32 size)
     T* new_buf = mCapacity > 0 ? (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment) : NULL;
     if (mArray)
     {
-#if LL_WINDOWS
-        LLVector4a::memcpyNonAliased16((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount);
-#else
-        memcpy((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount);
-#endif
+        ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount);
        ll_aligned_free(mArray);
     }

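A minimal caller-side sketch of the growth path these hunks change (illustrative, not part of the commit; LLVector4a is used as the element type so sizeof(T)*mElementCount stays a positive multiple of 16, which the new copy routine asserts):

    LLAlignedArray<LLVector4a, 64> verts;
    LLVector4a v;
    v.splat(1.f);
    for (U32 i = 0; i < 100; ++i)
    {
        // Each regrowth inside push_back now copies the old buffer with
        // ll_memcpy_nonaliased_aligned_16 on every platform, not just Windows.
        verts.push_back(v);
    }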
View File

@@ -36,6 +36,44 @@ class LLMutex ;
 #define LL_CHECK_MEMORY
 #endif
 
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr, U32 alignment);
+
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+#include <xmmintrin.h>
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
+}
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
+}
+
+#if LL_LINUX || LL_DARWIN
+#define LL_ALIGN_PREFIX(x)
+#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
+#elif LL_WINDOWS
+#define LL_ALIGN_PREFIX(x) __declspec(align(x))
+#define LL_ALIGN_POSTFIX(x)
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
 inline void* ll_aligned_malloc( size_t size, int align )
 {
 #if defined(LL_WINDOWS)
@@ -144,6 +182,78 @@ inline void ll_aligned_free_32(void *p)
 #endif
 }
 
+// Copy 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
+// Source and dest must be 16-byte aligned and size must be a multiple of 16.
+//
+inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
+{
+    assert(src != NULL);
+    assert(dst != NULL);
+    assert(bytes > 0);
+    assert((bytes % sizeof(F32)) == 0);
+    ll_assert_aligned(src, 16);
+    ll_assert_aligned(dst, 16);
+    assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src));
+    assert(bytes % 16 == 0);
+
+    char* end = dst + bytes;
+
+    if (bytes > 64)
+    {
+        // Find the start of the 64-byte aligned area within the block
+        //
+        void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+
+        // Stop the 64-byte loop at least 64 bytes before the end of the
+        // destination, then switch back to 16-byte copies
+        void* end_64 = end - 64;
+
+        // Prefetch the head of the 64-byte area now
+        //
+        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+
+        // Copy 16-byte chunks until we're 64-byte aligned
+        //
+        while (dst < begin_64)
+        {
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            dst += 16;
+            src += 16;
+        }
+
+        // Copy 64-byte chunks up to the tail
+        //
+        // Might be good to shmoo the 512-byte prefetch offset
+        // (characterize performance for various values)
+        //
+        while (dst < end_64)
+        {
+            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
+            _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
+            _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
+            dst += 64;
+            src += 64;
+        }
+    }
+
+    // Copy remaining 16-byte tail chunks (or ALL 16-byte chunks for sub-64-byte copies)
+    //
+    while (dst < end)
+    {
+        _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+        dst += 16;
+        src += 16;
+    }
+}
+
 #ifndef __DEBUG_PRIVATE_MEM__
 #define __DEBUG_PRIVATE_MEM__ 0
 #endif
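A usage sketch for the new routine (hypothetical buffer names, illustrative only): both allocations come from the 16-byte aligned allocator family in this header and the byte count is a positive multiple of 16, so every assert above holds:

    const size_t bytes = 256; // positive multiple of 16
    char* src = (char*) ll_aligned_malloc_16(bytes);
    char* dst = (char*) ll_aligned_malloc_16(bytes);
    memset(src, 0xAB, bytes);                         // test pattern
    ll_memcpy_nonaliased_aligned_16(dst, src, bytes); // buffers must not overlap
    ll_aligned_free_16(dst);
    ll_aligned_free_16(src);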
@@ -552,13 +662,7 @@ void LLPrivateMemoryPoolTester::operator delete[](void* addr)
 
 // LLSingleton moved to llsingleton.h
 
-LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
-
-#ifdef SHOW_ASSERT
-#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
-#else
-#define ll_assert_aligned(ptr,alignment)
-#endif
-
 #endif

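The LL_NEXT_ALIGNED_ADDRESS helpers that moved into this header round a pointer up with a standard mask trick: add (alignment - 1), then clear the low bits. A worked sketch with illustrative addresses:

    char* p = reinterpret_cast<char*>(0x1003);
    char* q = LL_NEXT_ALIGNED_ADDRESS(p);    // (0x1003 + 0xF)  & ~0xF  == 0x1010
    char* r = LL_NEXT_ALIGNED_ADDRESS_64(p); // (0x1003 + 0x3F) & ~0x3F == 0x1040
    // An already-aligned address is unchanged: (0x1010 + 0xF) & ~0xF == 0x1010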
View File

@@ -39,34 +39,6 @@
 #include <stdint.h>
 #endif
 
-template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
-{
-    return reinterpret_cast<T*>(
-        (reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
-}
-
-template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
-{
-    return reinterpret_cast<T*>(
-        (reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
-}
-
-#if LL_LINUX || LL_DARWIN
-#define LL_ALIGN_PREFIX(x)
-#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
-#elif LL_WINDOWS
-#define LL_ALIGN_PREFIX(x) __declspec(align(x))
-#define LL_ALIGN_POSTFIX(x)
-#else
-#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
-#endif
-
-#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
-
 #include <xmmintrin.h>
 #include <emmintrin.h>

View File

@@ -41,55 +41,7 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F
 /*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
 {
-    assert(src != NULL);
-    assert(dst != NULL);
-    assert(bytes > 0);
-    assert((bytes % sizeof(F32)) == 0);
-    ll_assert_aligned(src, 16);
-    ll_assert_aligned(dst, 16);
-    assert(bytes % 16 == 0);
-
-    F32* end = dst + (bytes / sizeof(F32));
-
-    if (bytes > 64)
-    {
-        F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
-
-        // At least 64 (16*4) bytes before the end of the destination, switch to 16-byte copies
-        F32* end_64 = end - 16;
-
-        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
-        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
-        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
-        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
-
-        while (dst < begin_64)
-        {
-            copy4a(dst, src);
-            dst += 4;
-            src += 4;
-        }
-
-        while (dst < end_64)
-        {
-            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
-            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
-            copy4a(dst, src);
-            copy4a(dst+4, src+4);
-            copy4a(dst+8, src+8);
-            copy4a(dst+12, src+12);
-            dst += 16;
-            src += 16;
-        }
-    }
-
-    while (dst < end)
-    {
-        copy4a(dst, src);
-        dst += 4;
-        src += 4;
-    }
+    ll_memcpy_nonaliased_aligned_16((char*)dst, (char*)src, bytes);
 }
 
 void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )

View File

@@ -93,11 +93,7 @@ public:
     LLVector4a()
     { //DO NOT INITIALIZE -- The overhead is completely unnecessary
-        // This assert is causing spurious referenced before set warnings on GCC 4.3.4
-        //
-#if !LL_LINUX
         ll_assert_aligned(this,16);
-#endif
     }
 
     LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)

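With the !LL_LINUX guard removed, the default constructor asserts 16-byte alignment on every platform, so LLVector4a storage must come from an aligned source. A brief sketch (assuming the 16-byte allocator from llmemory.h):

    LL_ALIGN_16(LLVector4a local); // compiler-aligned stack/member storage; ctor assert holds
    local.splat(0.f);

    LLVector4a* heap = (LLVector4a*) ll_aligned_malloc_16(4 * sizeof(LLVector4a));
    heap[0].splat(1.f);            // 16-byte aligned heap storage, so the assert holds here too
    ll_aligned_free_16(heap);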
View File

@@ -4729,10 +4729,13 @@ void LLVolumeFace::optimize(F32 angle_cutoff)
         }
     }
 
-    llassert(new_face.mNumIndices == mNumIndices);
     llassert(new_face.mNumVertices <= mNumVertices);
-    swapData(new_face);
+
+    // Only swap data if we've actually optimized the mesh
+    //
+    if (new_face.mNumVertices < mNumVertices)
+    {
+        llassert(new_face.mNumIndices == mNumIndices);
+        swapData(new_face);
+    }
 }
class LLVCacheTriangleData;