#ifndef AGX_SIMD_VEC3_H
#define AGX_SIMD_VEC3_H
Vec3T(const __m128 _xyzw);
Vec3T scale(const Vec3T& val) const;
Vec3T scale(const __m128 val) const;
static Vec3T madd(const Vec3T& a, const Vec3T& b, const Vec3T& c);
static Vec3T cross(const Vec3T& lhs, const Vec3T& rhs);
static agx::Real32 dot(const Vec3T& lhs, const Vec3T& rhs);
static agx::Real32 innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12);
Vec3T(const __m256d _xyzw);
Vec3T(const __m128d _xy, const __m128d _zw);
Vec3T negate() const;
Vec3T scale(const Vec3T& val) const;
#if AGX_USE_SSE() && !AGX_USE_AVX()
Vec3T scale(const __m128d val) const;
static Vec3T madd(const Vec3T& a, const Vec3T& b, const Vec3T& c);
static Vec3T cross(const Vec3T& lhs, const Vec3T& rhs);
static agx::Real64 dot(const Vec3T& lhs, const Vec3T& rhs);
static agx::Real64 innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12);
typedef Vec3T<agx::Real32> Vec3f;
typedef Vec3T<agx::Real64> Vec3d;
typedef Vec3T<agx::Real> Vec3;

std::ostream& operator << ( std::ostream& output, const Vec3d& v );
std::ostream& operator << ( std::ostream& output, const Vec3f& v );
AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const __m128 _xyzw) : xyzw(_xyzw)
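// Conversion from agx::Vec3d: the AVX path converts all four doubles to
// floats in one instruction; the SSE2 path converts the (x,y) and (z,w)
// pairs separately. _mm_cvtpd_ps zeroes the upper two float lanes, so the
// (z,w) result can be shifted up by 8 bytes and OR-ed with the (x,y) result.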
xyzw = _mm256_cvtpd_ps(_mm256_load_pd(vec.ptr()));

__m128 lower = _mm_cvtpd_ps(_mm_load_pd(vec.ptr()));
__m128 upper = _mm_cvtpd_ps(_mm_load_pd(vec.ptr()+2));
xyzw = _mm_or_ps(lower, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(upper), 8)));
_mm_store_ps( tmp, xyzw );
_mm_store_ps( tmp, xyzw );
_mm_store_ps( tmp, xyzw );
_mm_store_ps(target.ptr(), xyzw);
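// absolute(): clear each lane's sign bit. _mm_andnot_ps(a, b) computes
// (~a) & b, so the sign mask must be the first argument.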
const __m128 SIGNMASK = _mm_set1_ps(-0.0f);
return Vec3T(_mm_andnot_ps(SIGNMASK, xyzw));

return Vec3T(std::abs(xyzw[0]), std::abs(xyzw[1]), std::abs(xyzw[2]));
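// negate(): flip each lane's sign bit by XOR-ing with -0.0f.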
const __m128 SIGNMASK = _mm_set1_ps(-0.0f);
return Vec3T(_mm_xor_ps(xyzw, SIGNMASK));
return Vec3T(_mm_mul_ps(xyzw, val.xyzw));

return Vec3T(_mm_mul_ps(xyzw, val));

return scale(_mm_set1_ps(val));

return Vec3T(xyzw * val);
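// madd(a, b, c): per-lane a + b*c, issued as separate multiply and add (no
// FMA instructions are assumed).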
return Vec3T( _mm_add_ps( a.xyzw, _mm_mul_ps( b.xyzw, c.xyzw ) ) );
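// cross(): permute both operands into (y,z,x) and (z,x,y) order; the
// difference of the two lane-wise products yields all three components of
// lhs x rhs in one mul/sub pair. Lane 3 carries no meaningful data.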
const agx::UInt32 shuffle_3021 = _MM_SHUFFLE(3, 0, 2, 1);
const agx::UInt32 shuffle_3102 = _MM_SHUFFLE(3, 1, 0, 2);

const __m128 a = _mm_mul_ps(_mm_shuffle_ps(lhs.xyzw, lhs.xyzw, shuffle_3021), _mm_shuffle_ps(rhs.xyzw, rhs.xyzw, shuffle_3102));
const __m128 b = _mm_mul_ps(_mm_shuffle_ps(lhs.xyzw, lhs.xyzw, shuffle_3102), _mm_shuffle_ps(rhs.xyzw, rhs.xyzw, shuffle_3021));

_mm_store_ps( tmp, _mm_sub_ps(a,b) );

return Vec3T(_mm_sub_ps(a,b));

return Vec3T(lhs.xyzw ^ rhs.xyzw);
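// The scalar fallback above relies on agx::Vec3f::operator^ being the cross
// product. dot(): the _mm_dp_ps mask 0x71 multiplies lanes 0-2 (high nibble
// 0x7) and writes the sum to lane 0 only (low nibble 0x1).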
const int mask = 0x71;

_mm_store_ps( output, _mm_dp_ps(lhs.xyzw, rhs.xyzw, mask) );

_mm_store_ps( scalarLhs, lhs.xyzw );
_mm_store_ps( scalarRhs, rhs.xyzw );

return scalarLhs[0] * scalarRhs[0] + scalarLhs[1] * scalarRhs[1] + scalarLhs[2] * scalarRhs[2];

return lhs.xyzw * rhs.xyzw;
return dot(*this, *this);
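// length(): with SSE4 the dot product and square root stay in the register;
// _mm_cvtss_f32 then extracts the scalar result.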
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(xyzw, xyzw, 0x71)));

return std::sqrt(length2());
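// innerProduct(v01, v02, v11, v12) == dot(v01, v11) + dot(v02, v12),
// fused so the horizontal reduction is paid only once.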
AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12)
__m128 tmp1 = _mm_mul_ps( v01.xyzw, v11.xyzw );
__m128 tmp2 = _mm_mul_ps( v02.xyzw, v12.xyzw );

tmp1 = _mm_add_ps( tmp1, tmp2 );

_mm_store_ps( output, tmp1 );

return output[0]+output[1]+output[2];

return v01.xyzw * v11.xyzw + v02.xyzw * v12.xyzw;
AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const __m256d _xyzw) : xyzw(_xyzw)

AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const __m128d _xy, const __m128d _zw) : xy(_xy), zw(_zw)
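// Conversion from agx::Vec3f: load four floats, widen (x,y) with
// _mm_cvtps_pd, then shift (z,w) down into the low lanes and widen them too.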
const __m128 lower = _mm_load_ps(vec.ptr());
xy = _mm_cvtps_pd(lower);

const __m128 upper = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(lower), 8));
zw = _mm_cvtps_pd(upper);
_mm256_store_pd( tmp, xyzw );

_mm_store_pd( tmp, xy );

_mm256_store_pd( tmp, xyzw );

_mm_store_pd( tmp, xy );

_mm256_store_pd( tmp, xyzw );

_mm_store_pd( tmp, zw );

_mm256_store_pd(target.ptr(), xyzw);

_mm_store_pd(target.ptr(), xy);
_mm_store_pd(target.ptr()+2, zw);
_mm_store_ps(target.ptr(), _mm256_cvtpd_ps(xyzw));

__m128 lower = _mm_cvtpd_ps(xy);
__m128 upper = _mm_cvtpd_ps(zw);
__m128 result = _mm_or_ps(lower, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(upper), 8)));
_mm_store_ps(target.ptr(), result);

target = agx::Vec3f((float)xyzw[0], (float)xyzw[1], (float)xyzw[2]);
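// Double-precision absolute()/negate(): the same sign-bit tricks as the
// float specialization. Again note the andnot argument order: the mask goes
// first.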
const __m256d SIGNMASK = _mm256_set1_pd(-0.0);
return Vec3T(_mm256_andnot_pd(SIGNMASK, xyzw));

const __m128d SIGNMASK = _mm_set1_pd(-0.0);
return Vec3T(_mm_andnot_pd(SIGNMASK, xy), _mm_andnot_pd(SIGNMASK, zw));

return Vec3T(std::abs(xyzw[0]), std::abs(xyzw[1]), std::abs(xyzw[2]));
const __m256d SIGNMASK = _mm256_set1_pd(-0.0);
return Vec3T(_mm256_xor_pd(xyzw, SIGNMASK));

const __m128d SIGNMASK = _mm_set1_pd(-0.0);
return Vec3T(_mm_xor_pd(xy, SIGNMASK), _mm_xor_pd(zw, SIGNMASK));

return Vec3T(_mm256_mul_pd(xyzw, val.xyzw));

return Vec3T(_mm_mul_pd(xy, val.xy), _mm_mul_pd(zw, val.zw));
#if AGX_USE_SSE() && !AGX_USE_AVX()

return Vec3T(_mm_mul_pd(xy, val), _mm_mul_pd(zw, val));

return scale(Vec3T(val));

return scale(_mm_set1_pd(val));

return Vec3T(xyzw * val);
return Vec3T( _mm256_add_pd( a.xyzw, _mm256_mul_pd( b.xyzw, c.xyzw ) ) );

const __m128d xy = _mm_add_pd( a.xy, _mm_mul_pd( b.xy, c.xy ) );
const __m128d zw = _mm_add_pd( a.zw, _mm_mul_pd( b.zw, c.zw ) );

return Vec3T(xy, zw);
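// cross(), AVX path: intentionally disabled. _mm256_shuffle_pd permutes only
// within 128-bit lanes, so the float-style shuffle constants below cannot
// produce the (y,z,x)/(z,x,y) orderings this algorithm needs.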
#error "This needs to be fixed!"

const agx::UInt32 shuffle_3021 = _MM_SHUFFLE(3, 0, 2, 1);
const agx::UInt32 shuffle_3102 = _MM_SHUFFLE(3, 1, 0, 2);

const __m256d a = _mm256_mul_pd(_mm256_shuffle_pd(lhs.xyzw, lhs.xyzw, shuffle_3021), _mm256_shuffle_pd(rhs.xyzw, rhs.xyzw, shuffle_3102));
const __m256d b = _mm256_mul_pd(_mm256_shuffle_pd(lhs.xyzw, lhs.xyzw, shuffle_3102), _mm256_shuffle_pd(rhs.xyzw, rhs.xyzw, shuffle_3021));

_mm256_store_pd( tmp, _mm256_sub_pd(a,b) );

return Vec3T(_mm256_sub_pd(a,b));
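// cross(), SSE path, operating on the xy/zw register halves: m1 builds the
// x and y components from the z-broadcast products (the shuffle/xor pair
// fixes their order and signs), and m2 computes
// z = lhs.x*rhs.y - lhs.y*rhs.x in the low lane while zeroing the unused
// w lane.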
const __m128d SIGN_NP = _mm_set_pd ( 0.0 , -0.0 );

__m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( lhs.zw , lhs.zw ) , rhs.xy );

__m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( rhs.zw , rhs.zw ) , lhs.xy );
__m128d m1 = _mm_sub_pd ( l1 , l2 );
m1 = _mm_shuffle_pd ( m1 , m1 , 1 );
m1 = _mm_xor_pd ( m1 , SIGN_NP );

l1 = _mm_mul_pd ( lhs.xy , _mm_shuffle_pd ( rhs.xy , rhs.xy , 1 ) );

__m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );

m2 = _mm_move_sd(_mm_setzero_pd(), m2);

return Vec3T(m1, m2);

return Vec3T(lhs.xyzw ^ rhs.xyzw);
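// dot(), AVX path: AVX has no 256-bit dot product for doubles, so the
// operands are split into 128-bit halves. The _mm_dp_pd mask 0x31 multiplies
// both lanes (high nibble 0x3) and writes the sum to lane 0 (low nibble 0x1).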
__m128d lhsXY = _mm256_extractf128_pd( lhs.xyzw, 0 );
__m128d lhsZW = _mm256_extractf128_pd( lhs.xyzw, 1 );
__m128d rhsXY = _mm256_extractf128_pd( rhs.xyzw, 0 );
__m128d rhsZW = _mm256_extractf128_pd( rhs.xyzw, 1 );

const int mask = 0x31;

const __m128d xyDot = _mm_dp_pd(lhsXY, rhsXY, mask);
const __m128d zDot = _mm_mul_pd(lhsZW, rhsZW);

_mm_store_pd( output, _mm_add_pd(xyDot, zDot));
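// Alternative AVX reduction via horizontal adds: _mm256_hadd_pd folds pairs
// within each 128-bit lane, and adding the upper half to the lower finishes
// the sum. This sums all four lanes, so it relies on the unused w lanes
// being zero.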
__m256d a = _mm256_mul_pd( lhs.xyzw, rhs.xyzw );
__m256d b = _mm256_hadd_pd( a, a );
__m128d hi128 = _mm256_extractf128_pd( b, 1 );
__m128d dotproduct = _mm_add_pd( _mm256_castpd256_pd128(b), hi128 );

_mm_store_pd( output, dotproduct );
const int mask = 0x31;

const __m128d xyDot = _mm_dp_pd(lhs.xy, rhs.xy, mask);
const __m128d zDot = _mm_mul_pd(lhs.zw, rhs.zw);

_mm_store_pd( output, _mm_add_pd(xyDot, zDot));

_mm_store_pd( scalarLhs, lhs.xy );
_mm_store_pd( scalarLhs + 2, lhs.zw );
_mm_store_pd( scalarRhs, rhs.xy );
_mm_store_pd( scalarRhs + 2, rhs.zw );

return scalarLhs[0] * scalarRhs[0] + scalarLhs[1] * scalarRhs[1] + scalarLhs[2] * scalarRhs[2];

return lhs.xyzw * rhs.xyzw;
return dot(*this, *this);
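// length(), double precision: the AVX branch below is compiled out ("&& 0"),
// since AVX provides no _mm256_dp_pd; the scalar square root of length2() is
// used instead.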
#if AGX_USE_AVX() && 0

_mm256_store_pd(output, _mm256_sqrt_pd(_mm256_dp_pd(xyzw, xyzw, 0x71)));

return std::sqrt(this->length2());
AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12)
__m256d tmp1 = _mm256_mul_pd( v01.xyzw, v11.xyzw );
__m256d tmp2 = _mm256_mul_pd( v02.xyzw, v12.xyzw );

tmp1 = _mm256_add_pd( tmp1, tmp2 );

_mm256_store_pd( output, tmp1 );

return output[0]+output[1]+output[2];
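// SSE path: accumulate the two lane-wise products per half, then one
// _mm_hadd_pd folds both partial sums so a single store and scalar add
// finish the reduction (again relying on zeroed w lanes).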
__m128d tmp1 = _mm_mul_pd( v01.xy, v11.xy );
__m128d tmp2 = _mm_mul_pd( v01.zw, v11.zw );

__m128d tmp3 = _mm_mul_pd( v02.xy, v12.xy );
__m128d tmp4 = _mm_mul_pd( v02.zw, v12.zw );

tmp1 = _mm_add_pd( tmp1, tmp2 );
tmp3 = _mm_add_pd( tmp3, tmp4 );

tmp1 = _mm_hadd_pd( tmp1, tmp3 );
_mm_store_pd( output, tmp1 );

return output[0] + output[1];

return v01.xyzw * v11.xyzw + v02.xyzw * v12.xyzw;
return output << tmp;

return output << tmp;
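// Usage sketch (illustrative only; assumes the converting constructors from
// agx::Vec3 suggested by the definitions above):
//   Vec3 a(v0), b(v1);                            // v0, v1: agx::Vec3
//   Vec3 n = Vec3::cross(a, b);                   // SIMD cross product
//   agx::Real d = Vec3::dot(a, b);                // SIMD dot product
//   agx::Real s = Vec3::innerProduct(a, b, a, b); // dot(a,a) + dot(b,b)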