Closed
Description
Here is the current Q_rsqrt() code:
inline float Q_rsqrt( float number )
{
float x = 0.5f * number;
float y;
// compute approximate inverse square root
#if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse)
// SSE rsqrt relative error bound: 3.7 * 10^-4
_mm_store_ss( &y, _mm_rsqrt_ss( _mm_load_ss( &number ) ) );
#else
y = Util::bit_cast<float>( 0x5f3759df - ( Util::bit_cast<uint32_t>( number ) >> 1 ) );
y *= ( 1.5f - ( x * y * y ) ); // initial iteration
// relative error bound after the initial iteration: 1.8 * 10^-3
#endif
y *= ( 1.5f - ( x * y * y ) ); // second iteration for higher precision
return y;
}
If I comment out the second iteration, like this:
inline float Q_rsqrt( float number )
{
float x = 0.5f * number;
float y;
// compute approximate inverse square root
#if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse)
// SSE rsqrt relative error bound: 3.7 * 10^-4
_mm_store_ss( &y, _mm_rsqrt_ss( _mm_load_ss( &number ) ) );
#else
y = Util::bit_cast<float>( 0x5f3759df - ( Util::bit_cast<uint32_t>( number ) >> 1 ) );
y *= ( 1.5f - ( x * y * y ) ); // initial iteration
// relative error bound after the initial iteration: 1.8 * 10^-3
#endif
// y *= ( 1.5f - ( x * y * y ) ); // second iteration for higher precision
return y;
}
The frame rate goes from 8 fps to 10 fps (+25%) with r_VBOmodel 0
using the branch and the test layout (177 visible models) from:
and I see no visual difference.