Use SSE2NEON to emulate SSE intrinsics when building against an ARM target
parent 9fef2a114e
commit 4ab2a80e6c
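Reviewer note: in short, on ARM64 builds the viewer now includes sse2neon.h, a header-only shim that reimplements the `_mm_*` SSE intrinsics on top of NEON, so the existing SIMD code compiles unchanged. A minimal standalone sketch of the idea (the detection block and `main` harness here are illustrative, not part of the commit):

// Minimal sketch: the same _mm_* source compiles for x86 and ARM64.
// Assumes sse2neon.h is on the include path for the ARM build.
#if defined(__aarch64__) || defined(__arm64__)
#include "sse2neon.h"        // SSE intrinsics emulated with NEON
#else
#include <xmmintrin.h>       // native SSE
#endif

int main()
{
    __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);   // lanes: { 1, 2, 3, 4 }
    __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f);   // lanes: { 5, 6, 7, 8 }
    __m128 sum = _mm_add_ps(a, b);               // lanes: { 6, 8, 10, 12 }
    float out[4];
    _mm_storeu_ps(out, sum);
    return out[0] == 6.f ? 0 : 1;
}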
@@ -71,7 +71,11 @@ LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
 #define ll_assert_aligned(ptr,alignment)
 #endif

+#if LL_ARM64
+#include "sse2neon.h"
+#else
 #include <xmmintrin.h>
+#endif

 template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
 {
@@ -339,6 +343,9 @@ LL_FORCE_INLINE void ll_aligned_free(void* ptr)
 inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
+#if defined(LL_ARM64)
+    memcpy(dst, src, bytes);
+#else
     assert(src != NULL);
     assert(dst != NULL);
     assert(bytes > 0);
@@ -404,6 +411,7 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __
         dst += 16;
         src += 16;
     }
+#endif
 }

 #ifndef __DEBUG_PRIVATE_MEM__
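Reviewer note on the two hunks above: on ARM64 the hand-rolled SSE copy loop is replaced wholesale by plain memcpy, which compilers already vectorize well for NEON. For reference, a simplified sketch of the x86 fast path that the new #if compiles out (assumes 16-byte-aligned pointers and a multiple-of-16 byte count; the real function also handles small and unaligned tails):

#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <cstdint>

// One aligned 128-bit load/store pair per 16-byte block.
inline void copy_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
{
    assert((reinterpret_cast<uintptr_t>(dst) & 0xF) == 0);   // 16-byte aligned
    assert((reinterpret_cast<uintptr_t>(src) & 0xF) == 0);
    assert(bytes % 16 == 0);
    for (const char* end = src + bytes; src != end; dst += 16, src += 16)
    {
        _mm_store_ps(reinterpret_cast<float*>(dst),
                     _mm_load_ps(reinterpret_cast<const float*>(src)));
    }
}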
@@ -31,16 +31,26 @@
 #error "Please include llmath.h before this file."
 #endif

-#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) )
-#error SSE2 not enabled. LLVector4a and related class will not compile.
+// the check for this error case must be split into multiple parts
+// because some versions of VS complain about '__SSE2__'
+#if ( ( LL_DARWIN || LL_LINUX ) )
+#if !(__SSE2__) && !(__arm64__) && !(__aarch64__)
+#error SSE2 not enabled. LLVector4a and related class will not compile.
+#endif
+#elif ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) )
+#error SSE2 not enabled. LLVector4a and related class will not compile.
 #endif

 #if !LL_WINDOWS
 #include <stdint.h>
 #endif

+#if defined(__arm64__) || defined(__aarch64__)
+#include "sse2neon.h"
+#else
 #include <xmmintrin.h>
 #include <emmintrin.h>
+#endif

 #include "llmemory.h"
 #include "llsimdtypes.h"
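A small portability detail in the hunk above: both spellings are checked because Apple Clang defines `__arm64__` while GCC and non-Apple Clang define `__aarch64__`. A hedged illustration (the `VIEWER_ARM64` helper macro is hypothetical, not part of this commit):

// Hypothetical helper: collapse the two ARM64 spellings into one flag.
#if defined(__arm64__) || defined(__aarch64__)
#define VIEWER_ARM64 1   // Apple Clang (__arm64__) or GCC/Clang (__aarch64__)
#else
#define VIEWER_ARM64 0
#endif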
@@ -115,7 +115,7 @@ inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
 // Set to all zeros
 inline void LLVector4a::clear()
 {
-    mQ = LLVector4a::getZero().mQ;
+    mQ = _mm_setzero_ps();
 }

 inline void LLVector4a::splat(const F32 x)
@@ -272,6 +272,9 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
 // Set all elements to the dot product of the x, y, and z elements in a and b
 inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f);
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -284,11 +287,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
     const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
     // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+#endif
 }

 // Set all elements to the dot product of the x, y, z, and w elements in a and b
 inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff);
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -301,21 +308,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)

     // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+#endif
 }

 // Return the 3D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0x7f);
+#else
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
     const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
     const LLQuad xPlusY = _mm_add_ps( ab, splatY );
     return _mm_add_ps( xPlusY, splatZ );
+#endif
 }

 // Return the 4D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0xff);
+#else
     // ab = { w, z, y, x }
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     // upperProdsInLowerElems = { y, x, y, x }
@@ -325,6 +340,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
     // shuffled = { z+x, z+x, z+x, z+x }
     const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
     return _mm_add_ss( sumOfPairs, shuffled );
+#endif
 }

 // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
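Reviewer note on the dot-product hunks: the ARM64 path takes a single `_mm_dp_ps` while x86 keeps the SSE2-era shuffle/add chain, because `_mm_dp_ps` is SSE4.1 and the x86 build only assumes SSE2, whereas sse2neon implements it directly with NEON multiplies and pairwise adds. In the immediate, the high nibble selects which lanes enter the sum and the low nibble selects which output lanes receive it, hence 0x7f for an x/y/z dot broadcast to all four lanes and 0xff for the full 4D dot. A minimal sketch (standalone; assumes an SSE4.1-capable x86 target or sse2neon on ARM):

#include <smmintrin.h>   // _mm_dp_ps (SSE4.1); sse2neon provides it on ARM64

// 0x7f: multiply lanes x,y,z (high nibble 0111), write sum to all lanes (1111).
__m128 dot3_splat(__m128 a, __m128 b) { return _mm_dp_ps(a, b, 0x7f); }

// 0xff: multiply all four lanes, write sum to all lanes.
__m128 dot4_splat(__m128 a, __m128 b) { return _mm_dp_ps(a, b, 0xff); }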
@@ -349,9 +349,9 @@ namespace tut
         ensure(
             "2. LLVector4 operator*(const LLVector4 &a, const LLQuaternion &rot) failed",
             is_approx_equal(-58153.5390f, result.mV[0]) &&
-            (183787.8125f == result.mV[1]) &&
-            (116864.164063f == result.mV[2]) &&
-            (78.099998f == result.mV[3]));
+            is_approx_equal(183787.8125f, result.mV[1]) &&
+            is_approx_equal(116864.164063f, result.mV[2]) &&
+            is_approx_equal(78.099998f, result.mV[3]));
     }

     //test case for LLVector3 operator*(const LLVector3 &a, const LLQuaternion &rot) fn.
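Reviewer note: this test change is a consequence of emulation. NEON's reassociated multiply-adds can differ from SSE results in the last bit, so exact float equality in the quaternion rotation test becomes a spurious failure on ARM; switching the remaining comparisons to is_approx_equal matches how mV[0] was already checked. For illustration, a ULP-style comparison in the same spirit (a hypothetical reimplementation; the viewer's real helper lives in llmath.h):

#include <cstdint>
#include <cstring>

// Treat two floats as equal when their IEEE-754 bit patterns are within a
// few ulps of each other (sketch only, not the viewer's implementation).
inline bool approx_equal(float a, float b, int32_t max_ulps = 4)
{
    int32_t ia, ib;
    std::memcpy(&ia, &a, sizeof ia);
    std::memcpy(&ib, &b, sizeof ib);
    if ((ia < 0) != (ib < 0))              // opposite signs: only +0 == -0
        return a == b;
    int32_t diff = ia > ib ? ia - ib : ib - ia;
    return diff <= max_ulps;
}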
@@ -972,6 +972,7 @@ bool LLAppViewer::init()
         return false;
     }

+#if defined(LL_X86) || defined(LL_X86_64)
     // Without SSE2 support we will crash almost immediately, warn here.
     if (!gSysCPU.hasSSE2())
     {
@@ -983,6 +984,7 @@ bool LLAppViewer::init()
         // quit immediately
         return false;
     }
+#endif

     // alert the user if they are using unsupported hardware
     if (!gSavedSettings.getBOOL("AlertedUnsupportedHardware"))
@@ -1268,7 +1270,7 @@ void LLAppViewer::initMaxHeapSize()
     //------------------------------------------------------------------------------------------
     //currently SL is built under 32-bit setting, we set its max heap size no more than 1.6 GB.

-#ifndef LL_X86_64
+#if !defined(LL_X86_64) && !defined(LL_ARM64)
     F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize") ;
 #else
     F32Gigabytes max_heap_size_gb = (F32Gigabytes)gSavedSettings.getF32("MaxHeapSize64");
@@ -3246,6 +3248,11 @@ LLSD LLAppViewer::getViewerInfo() const
     info["VIEWER_VERSION_STR"] = versionInfo.getVersion();
     info["CHANNEL"] = versionInfo.getChannel();
     info["ADDRESS_SIZE"] = ADDRESS_SIZE;
+#if LL_ARM64
+    info["ARCHITECTURE"] = "ARM";
+#else
+    info["ARCHITECTURE"] = "x86";
+#endif
     std::string build_config = versionInfo.getBuildConfig();
     if (build_config != "Release")
     {
@@ -5538,7 +5545,9 @@ void LLAppViewer::forceErrorBreakpoint()
 #ifdef LL_WINDOWS
     DebugBreak();
 #else
+#if defined(LL_X86) || defined(LL_X86_64)
     asm ("int $3");
+#endif
 #endif
     return;
 }
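Reviewer note on the last hunk: `int $3` is x86 inline assembly, so on ARM the forced breakpoint is compiled out rather than ported. If an ARM64 trap were wanted instead (an assumption on my part, not something this commit does), the AArch64 equivalent would look like:

// Hypothetical ARM64 branch for forceErrorBreakpoint(); not in this commit.
#if defined(__aarch64__)
    __asm__ volatile("brk #0");   // AArch64 software breakpoint
#elif defined(__i386__) || defined(__x86_64__)
    __asm__ volatile("int $3");   // x86 software breakpoint
#endif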