agxSIMD/Vec3.h
/*
Copyright 2007-2025. Algoryx Simulation AB.

All AGX source code, intellectual property, documentation, sample code,
tutorials, scene files and technical white papers, are copyrighted, proprietary
and confidential material of Algoryx Simulation AB. You may not download, read,
store, distribute, publish, copy or otherwise disseminate, use or expose this
material unless having a written signed agreement with Algoryx Simulation AB, or having been
advised so by Algoryx Simulation AB for a time limited evaluation, or having purchased a
valid commercial license from Algoryx Simulation AB.

Algoryx Simulation AB disclaims all responsibilities for loss or damage caused
from using this software, unless otherwise stated in written agreements with
Algoryx Simulation AB.
*/

#ifndef AGX_SIMD_VEC3_H
#define AGX_SIMD_VEC3_H


#include <agx/Integer.h>
#include <agx/Real.h>
#include <agx/Vec3.h>
#include <iosfwd>

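// Intrinsic headers for the SIMD paths selected at configure time:
// pmmintrin.h exposes SSE3, smmintrin.h SSE4.1 and immintrin.h AVX.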
#if AGX_USE_SSE()
#include <pmmintrin.h>
#endif

#if AGX_USE_SSE4()
#include <smmintrin.h>
#endif

#if AGX_USE_AVX()
#include <immintrin.h>
#endif

namespace agxSIMD
{
  // Vec3 base class, explicit specializations for Real32 and Real64
  template <typename T>
  class Vec3T
  {
  };

  // 32bit real specialization
  template <>
  class Vec3T<agx::Real32>
  {
  public:
    typedef agx::Real32 RealT;
    typedef agx::Real32 Type;
    static const agx::UInt ALIGNMENT = 16;

  public:

    Vec3T();
    Vec3T(agx::Real32 val);
    Vec3T(agx::Real32 x, agx::Real32 y, agx::Real32 z);

#if AGX_USE_SSE()
    Vec3T(const __m128 _xyzw);
#endif

    explicit Vec3T(const agx::Vec3f& vec);
    explicit Vec3T(const agx::Vec3d& vec);

    agx::Real32 x() const;
    agx::Real32 y() const;
    agx::Real32 z() const;

    agx::Real32 length() const;
    agx::Real32 length2() const;

    void store(agx::Vec3f& target) const;
    Vec3T absolute() const;
    Vec3T negate() const;
    Vec3T scale(const Vec3T& val) const;

#if AGX_USE_SSE()
    Vec3T scale(const __m128 val) const;
#endif

    Vec3T scale(agx::Real32 val) const;

    // Returns a + b * c, element-wise
    static Vec3T madd(const Vec3T& a, const Vec3T& b, const Vec3T& c);
    static Vec3T cross(const Vec3T& lhs, const Vec3T& rhs);

    static agx::Real32 dot(const Vec3T& lhs, const Vec3T& rhs);

    // Returns dot(v01, v11) + dot(v02, v12), evaluated in a single pass
    static agx::Real32 innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12);

  public:
#if AGX_USE_SSE()
    __m128 xyzw;
#else
    agx::Vec3f xyzw;
#endif
  };
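
  // Note: the scalar-argument constructors explicitly zero the fourth (w)
  // SSE lane, and cross() asserts in debug builds that the lane is still zero.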

  // 64bit real specialization
  template <>
  class Vec3T<agx::Real64>
  {
  public:
    typedef agx::Real64 RealT;
    typedef agx::Real64 Type;

#if AGX_USE_AVX()
    static const agx::UInt ALIGNMENT = 32;
#else
    static const agx::UInt ALIGNMENT = 16;
#endif

  public:
    Vec3T();
    Vec3T(agx::Real64 val);
    Vec3T(agx::Real64 x, agx::Real64 y, agx::Real64 z);

#if AGX_USE_AVX()
    Vec3T(const __m256d _xyzw);
#elif AGX_USE_SSE()
    Vec3T(const __m128d _xy, const __m128d _zw);
#endif

    explicit Vec3T(const agx::Vec3d& vec);
    explicit Vec3T(const agx::Vec3f& vec);

    agx::Real64 x() const;
    agx::Real64 y() const;
    agx::Real64 z() const;

    agx::Real64 length() const;
    agx::Real64 length2() const;

    void store(agx::Vec3d& target) const;
    void store(agx::Vec3f& target) const;

    Vec3T absolute() const;
    Vec3T negate() const;
    Vec3T scale(const Vec3T& val) const;

#if AGX_USE_SSE() && !AGX_USE_AVX()
    Vec3T scale(const __m128d val) const;
#endif

    Vec3T scale(agx::Real64 val) const;

    // Returns a + b * c, element-wise
    static Vec3T madd(const Vec3T& a, const Vec3T& b, const Vec3T& c);
    static Vec3T cross(const Vec3T& lhs, const Vec3T& rhs);
    static agx::Real64 dot(const Vec3T& lhs, const Vec3T& rhs);

    // Returns dot(v01, v11) + dot(v02, v12), evaluated in a single pass
    static agx::Real64 innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12);

  public:
#if AGX_USE_AVX()
    __m256d xyzw;
#elif AGX_USE_SSE()
    __m128d xy;
    __m128d zw;
#else
    agx::Vec3d xyzw;
#endif
  };
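
  // Storage note: with AVX the three doubles (plus a zeroed w lane) occupy a
  // single 256-bit register; with SSE only, they are split across two 128-bit
  // registers holding (x, y) and (z, w).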


  typedef Vec3T<agx::Real32> Vec3f;
  typedef Vec3T<agx::Real64> Vec3d;
  typedef Vec3T<agx::Real> Vec3;

  std::ostream& operator << ( std::ostream& output, const Vec3d& v );
  std::ostream& operator << ( std::ostream& output, const Vec3f& v );
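
  // Usage sketch (illustrative, assuming the scalar agx vectors satisfy the
  // ALIGNMENT requirement of the corresponding specialization):
  //
  //   agx::Vec3f a(1.0f, 2.0f, 3.0f);
  //   agx::Vec3f b(4.0f, 5.0f, 6.0f);
  //   agx::Vec3f result;
  //
  //   const agxSIMD::Vec3f va(a);
  //   const agxSIMD::Vec3f vb(b);
  //   agxSIMD::Vec3f::cross(va, vb).store(result); // result = a x b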


  /* Implementation */

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T() : Vec3T(0.0f)
  {}

#if AGX_USE_SSE()
  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(agx::Real32 val) : xyzw(_mm_setr_ps(val, val, val, 0))
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(agx::Real32 x, agx::Real32 y, agx::Real32 z) : xyzw(_mm_setr_ps(x, y, z, 0))
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const __m128 _xyzw) : xyzw(_xyzw)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const agx::Vec3f& vec) : xyzw(_mm_load_ps(vec.ptr()))
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const agx::Vec3d& vec)
  {
  #if AGX_USE_AVX()
    xyzw = _mm256_cvtpd_ps(_mm256_load_pd(vec.ptr()));
  #else
    // Convert (x, y) and (z, w) separately, then shift the upper pair into
    // the two high float lanes and merge.
    __m128 lower = _mm_cvtpd_ps(_mm_load_pd(vec.ptr()));
    __m128 upper = _mm_cvtpd_ps(_mm_load_pd(vec.ptr()+2));
    xyzw = _mm_or_ps(lower, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(upper), 8)));
  #endif
  }

#else

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(agx::Real32 val) : xyzw(val)
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(agx::Real32 x, agx::Real32 y, agx::Real32 z) : xyzw(x, y, z)
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const agx::Vec3f& vec) : xyzw(vec)
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real32>::Vec3T(const agx::Vec3d& vec) : xyzw(vec)
  {
  }
#endif


  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::x() const
  {
#if AGX_USE_SSE()
    AGX_ALIGNED( agx::Real32, 16 ) tmp[4];
    _mm_store_ps( tmp, xyzw );
    return tmp[0];
#else
    return xyzw.x();
#endif
  }

  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::y() const
  {
#if AGX_USE_SSE()
    AGX_ALIGNED( agx::Real32, 16 ) tmp[4];
    _mm_store_ps( tmp, xyzw );
    return tmp[1];
#else
    return xyzw.y();
#endif
  }


  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::z() const
  {
#if AGX_USE_SSE()
    AGX_ALIGNED( agx::Real32, 16 ) tmp[4];
    _mm_store_ps( tmp, xyzw );
    return tmp[2];
#else
    return xyzw.z();
#endif
  }

  AGX_FORCE_INLINE void Vec3T<agx::Real32>::store(agx::Vec3f& target) const
  {
#if AGX_USE_SSE()
    _mm_store_ps(target.ptr(), xyzw);
#else
    target = xyzw;
#endif
  }

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::absolute() const
  {
#if AGX_USE_SSE()
    // Clear the sign bit of each lane. Note the operand order:
    // _mm_andnot_ps(a, b) computes (~a) & b, so the mask goes first.
    const __m128 SIGNMASK = _mm_set1_ps(-0.0f); // -0.0f = 1 << 31
    return Vec3T(_mm_andnot_ps(SIGNMASK, xyzw));
#else
    return Vec3T(std::abs(xyzw[0]),
                 std::abs(xyzw[1]),
                 std::abs(xyzw[2]));
#endif
  }

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::negate() const
  {
#if AGX_USE_SSE()
    // Flip the sign bit of each lane.
    const __m128 SIGNMASK = _mm_set1_ps(-0.0f); // -0.0f = 1 << 31
    return Vec3T(_mm_xor_ps(xyzw, SIGNMASK));
#else
    return Vec3T(-xyzw);
#endif
  }

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::scale(const Vec3T& val) const
  {
#if AGX_USE_SSE()
    return Vec3T(_mm_mul_ps(xyzw, val.xyzw));
#else
    return Vec3T(agx::Vec3f::mul(xyzw, val.xyzw));
#endif
  }


#if AGX_USE_SSE()
  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::scale(const __m128 val) const
  {
    return Vec3T(_mm_mul_ps(xyzw, val));
  }
#endif

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::scale(agx::Real32 val) const
  {
#if AGX_USE_SSE()
    return scale(_mm_set1_ps(val));
#else
    return Vec3T(xyzw * val);
#endif
  }

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::madd(const Vec3T& a, const Vec3T& b, const Vec3T& c)
  {
#if AGX_USE_SSE()
    return Vec3T( _mm_add_ps( a.xyzw, _mm_mul_ps( b.xyzw, c.xyzw ) ) );
#else
    return Vec3T(a.xyzw + agx::Vec3f::mul(b.xyzw, c.xyzw));
#endif
  }

  AGX_FORCE_INLINE Vec3f Vec3T<agx::Real32>::cross(const Vec3T& lhs, const Vec3T& rhs)
  {
#if AGX_USE_SSE()
    // Rotate (x, y, z, w) to (y, z, x, w) and (z, x, y, w); a - b is then the
    // standard cross product, with the w lane remaining zero.
    const agx::UInt32 shuffle_3021 = _MM_SHUFFLE(3, 0, 2, 1);
    const agx::UInt32 shuffle_3102 = _MM_SHUFFLE(3, 1, 0, 2);

    const __m128 a = _mm_mul_ps(_mm_shuffle_ps(lhs.xyzw, lhs.xyzw, shuffle_3021), _mm_shuffle_ps(rhs.xyzw, rhs.xyzw, shuffle_3102));
    const __m128 b = _mm_mul_ps(_mm_shuffle_ps(lhs.xyzw, lhs.xyzw, shuffle_3102), _mm_shuffle_ps(rhs.xyzw, rhs.xyzw, shuffle_3021));

  #ifdef AGX_DEBUG
    AGX_ALIGNED( agx::Real32, 16 ) tmp[4];
    _mm_store_ps( tmp, _mm_sub_ps(a,b) );
    agxAssert(tmp[3] == 0);
  #endif

    return Vec3T(_mm_sub_ps(a,b));
#else
    return Vec3T(lhs.xyzw ^ rhs.xyzw);
#endif
  }

  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::dot(const Vec3T& lhs, const Vec3T& rhs)
  {
#if AGX_USE_SSE4()
    // _mm_dp_ps mask 0x71: multiply lanes x, y, z (high nibble 0x7) and
    // write the sum to lane 0 of the result (low nibble 0x1).
    const int mask = 0x71;
    AGX_ALIGNED( agx::Real32, 16 ) output[4];
    _mm_store_ps( output, _mm_dp_ps(lhs.xyzw, rhs.xyzw, mask) );
    return output[0];
#elif AGX_USE_SSE()
    AGX_ALIGNED( agx::Real32, 16 ) scalarLhs[4];
    AGX_ALIGNED( agx::Real32, 16 ) scalarRhs[4];

    _mm_store_ps( scalarLhs, lhs.xyzw );
    _mm_store_ps( scalarRhs, rhs.xyzw );

    return scalarLhs[0] * scalarRhs[0] + scalarLhs[1] * scalarRhs[1] + scalarLhs[2] * scalarRhs[2];
#else
    return lhs.xyzw * rhs.xyzw;
#endif
  }

  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::length2() const
  {
#if AGX_USE_SSE()
    return dot(*this, *this);
#else
    return (agx::Real32)xyzw.length2();
#endif
  }

  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::length() const
  {
#if AGX_USE_SSE()

  #if AGX_USE_SSE4()
    return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(xyzw, xyzw, 0x71)));
  #else
    return std::sqrt(length2());
  #endif

#else
    return (agx::Real32)xyzw.length();
#endif
  }


  AGX_FORCE_INLINE agx::Real32 Vec3T<agx::Real32>::innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12)
  {
#if AGX_USE_SSE()
    AGX_ALIGNED( agx::Real32, 16 ) output[4];

    __m128 tmp1 = _mm_mul_ps( v01.xyzw, v11.xyzw );
    __m128 tmp2 = _mm_mul_ps( v02.xyzw, v12.xyzw );

    tmp1 = _mm_add_ps( tmp1, tmp2 );

    _mm_store_ps( output, tmp1 );

    return output[0]+output[1]+output[2];
#else
    return v01.xyzw * v11.xyzw + v02.xyzw * v12.xyzw;
#endif
  }


#if AGX_USE_AVX()

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T() : Vec3T(0.0)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 val) : xyzw(_mm256_setr_pd(val, val, val, 0))
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 x, agx::Real64 y, agx::Real64 z) : xyzw(_mm256_setr_pd(x, y, z, 0))
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const __m256d _xyzw) : xyzw(_xyzw)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3d& vec) : xyzw(_mm256_load_pd(vec.ptr()))
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3f& vec) : xyzw(_mm256_cvtps_pd(_mm_load_ps(vec.ptr())))
  {}

#elif AGX_USE_SSE()

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T() : Vec3T(0.0)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 val) : xy(_mm_set1_pd(val)), zw(_mm_setr_pd(val, 0))
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 x, agx::Real64 y, agx::Real64 z) : xy(_mm_setr_pd(x, y)), zw(_mm_setr_pd(z, 0))
  {
  }


  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const __m128d _xy, const __m128d _zw) : xy(_xy), zw(_zw)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3d& vec) : xy(_mm_load_pd(vec.ptr())), zw(_mm_load_pd(vec.ptr()+2))
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3f& vec)
  {
    const __m128 lower = _mm_load_ps(vec.ptr());
    xy = _mm_cvtps_pd(lower);

    // Shift the (z, w) pair down into the two low float lanes before converting.
    const __m128 upper = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(lower), 8));
    zw = _mm_cvtps_pd(upper);
  }

#else

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T() : Vec3T(0.0)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 val) : xyzw(val)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(agx::Real64 x, agx::Real64 y, agx::Real64 z) : xyzw(x, y, z)
  {
  }

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3d& vec) : xyzw(vec)
  {}

  AGX_FORCE_INLINE Vec3T<agx::Real64>::Vec3T(const agx::Vec3f& vec) : xyzw(vec)
  {}

#endif

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::x() const
  {
#if AGX_USE_AVX()
    AGX_ALIGNED( agx::Real64, 32 ) tmp[4];
    _mm256_store_pd( tmp, xyzw );
    return tmp[0];
#elif AGX_USE_SSE()
    AGX_ALIGNED( agx::Real64, 16 ) tmp[2];
    _mm_store_pd( tmp, xy );
    return tmp[0];
#else
    return xyzw.x();
#endif
  }

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::y() const
  {
#if AGX_USE_AVX()
    AGX_ALIGNED( agx::Real64, 32 ) tmp[4];
    _mm256_store_pd( tmp, xyzw );
    return tmp[1];
#elif AGX_USE_SSE()
    AGX_ALIGNED( agx::Real64, 16 ) tmp[2];
    _mm_store_pd( tmp, xy );
    return tmp[1];
#else
    return xyzw.y();
#endif
  }

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::z() const
  {
#if AGX_USE_AVX()
    AGX_ALIGNED( agx::Real64, 32 ) tmp[4];
    _mm256_store_pd( tmp, xyzw );
    return tmp[2];
#elif AGX_USE_SSE()
    AGX_ALIGNED( agx::Real64, 16 ) tmp[2];
    _mm_store_pd( tmp, zw );
    return tmp[0];
#else
    return xyzw.z();
#endif
  }

  AGX_FORCE_INLINE void Vec3T<agx::Real64>::store(agx::Vec3d& target) const
  {
#if AGX_USE_AVX()
    _mm256_store_pd(target.ptr(), xyzw);
#elif AGX_USE_SSE()
    _mm_store_pd(target.ptr()  , xy);
    _mm_store_pd(target.ptr()+2, zw);
#else
    target = xyzw;
#endif
  }

  AGX_FORCE_INLINE void Vec3T<agx::Real64>::store(agx::Vec3f& target) const
  {
#if AGX_USE_AVX()
    _mm_store_ps(target.ptr(), _mm256_cvtpd_ps(xyzw));
#elif AGX_USE_SSE()
    // Convert the two register halves separately, shift the (z, w) pair into
    // the high float lanes and merge before storing.
    __m128 lower = _mm_cvtpd_ps(xy);
    __m128 upper = _mm_cvtpd_ps(zw);
    __m128 result = _mm_or_ps(lower, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(upper), 8)));
    _mm_store_ps(target.ptr(), result);
#else
    target = agx::Vec3f((float)xyzw[0], (float)xyzw[1], (float)xyzw[2]);
#endif
  }

  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::absolute() const
  {
#if AGX_USE_AVX()
    // Clear the sign bit of each lane; _mm256_andnot_pd(a, b) computes
    // (~a) & b, so the mask goes first.
    const __m256d SIGNMASK = _mm256_set1_pd(-0.0); // -0.0 = 1 << 63
    return Vec3T(_mm256_andnot_pd(SIGNMASK, xyzw));
#elif AGX_USE_SSE()
    const __m128d SIGNMASK = _mm_set1_pd(-0.0); // -0.0 = 1 << 63
    return Vec3T(_mm_andnot_pd(SIGNMASK, xy), _mm_andnot_pd(SIGNMASK, zw));
#else
    return Vec3T(std::abs(xyzw[0]),
                 std::abs(xyzw[1]),
                 std::abs(xyzw[2]));
#endif
  }

  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::negate() const
  {
#if AGX_USE_AVX()
    const __m256d SIGNMASK = _mm256_set1_pd(-0.0); // -0.0 = 1 << 63
    return Vec3T(_mm256_xor_pd(xyzw, SIGNMASK));
#elif AGX_USE_SSE()
    const __m128d SIGNMASK = _mm_set1_pd(-0.0); // -0.0 = 1 << 63
    return Vec3T(_mm_xor_pd(xy, SIGNMASK), _mm_xor_pd(zw, SIGNMASK));
#else
    return Vec3T(-xyzw);
#endif
  }

  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::scale(const Vec3T& val) const
  {
#if AGX_USE_AVX()
    return Vec3T(_mm256_mul_pd(xyzw, val.xyzw));
#elif AGX_USE_SSE()
    return Vec3T(_mm_mul_pd(xy, val.xy), _mm_mul_pd(zw, val.zw));
#else
    return Vec3T(agx::Vec3d::mul(xyzw, val.xyzw));
#endif
  }

#if AGX_USE_SSE() && !AGX_USE_AVX()
  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::scale(const __m128d val) const
  {
    return Vec3T(_mm_mul_pd(xy, val), _mm_mul_pd(zw, val));
  }
#endif

  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::scale(agx::Real64 val) const
  {
#if AGX_USE_AVX()
    return scale(Vec3T(val));
#elif AGX_USE_SSE()
    return scale(_mm_set1_pd(val));
#else
    return Vec3T(xyzw * val);
#endif
  }

  AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::madd(const Vec3T& a, const Vec3T& b, const Vec3T& c)
  {
#if AGX_USE_AVX()
    return Vec3T( _mm256_add_pd( a.xyzw, _mm256_mul_pd( b.xyzw, c.xyzw ) ) );
#elif AGX_USE_SSE()
    const __m128d xy = _mm_add_pd( a.xy, _mm_mul_pd( b.xy, c.xy ) );
    const __m128d zw = _mm_add_pd( a.zw, _mm_mul_pd( b.zw, c.zw ) );

    return Vec3T(xy, zw);
#else
    return Vec3T(a.xyzw + agx::Vec3d::mul(b.xyzw, c.xyzw));
#endif
  }
631
632
633 AGX_FORCE_INLINE Vec3d Vec3T<agx::Real64>::cross(const Vec3T& lhs, const Vec3T& rhs)
634 {
635#if AGX_USE_AVX()
636#error "This needs to be fixed!"
637 // NOTE: Disabled, permute256 semantics are weird :/
638 const agx::UInt32 shuffle_3021 = _MM_SHUFFLE(3, 0, 2, 1);
639 const agx::UInt32 shuffle_3102 = _MM_SHUFFLE(3, 1, 0, 2);
640
641 const __m256d a = _mm256_mul_pd(_mm256_shuffle_pd(lhs.xyzw, lhs.xyzw, shuffle_3021), _mm256_shuffle_pd(rhs.xyzw, rhs.xyzw, shuffle_3102));
642 const __m256d b = _mm256_mul_pd(_mm256_shuffle_pd(lhs.xyzw, lhs.xyzw, shuffle_3102), _mm256_shuffle_pd(rhs.xyzw, rhs.xyzw, shuffle_3021));
643
644 #ifdef AGX_DEBUG
645 AGX_ALIGNED( agx::Real64, 32 ) tmp[4];
646 _mm256_store_pd( tmp, _mm256_sub_pd(a,b) );
647 agxAssert(tmp[3] == 0);
648 #endif
649
650 return Vec3T(_mm256_sub_pd(a,b));
651#elif AGX_USE_SSE()
652 const __m128d SIGN_NP = _mm_set_pd ( 0.0 , -0.0 );
653
654
655 // lhs.z * rhs.x, lhs.z * rhs.y
656 __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( lhs.zw , lhs.zw ) , rhs.xy );
657
658 // rhs.z * lhs.x, rhs.z * lhs.y
659 __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( rhs.zw , rhs.zw ) , lhs.xy );
660 __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
661 m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
662 m1 = _mm_xor_pd ( m1 , SIGN_NP ); // change the sign of the first element
663
664 // lhs.x * rhs.y, lhs.y * rhs.x
665 l1 = _mm_mul_pd ( lhs.xy , _mm_shuffle_pd ( rhs.xy , rhs.xy , 1 ) );
666 // lhs.x * rhs.y - lhs.y * rhs.x
667 __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
668
669 // Clear w-component
670 m2 = _mm_move_sd(_mm_setzero_pd(), m2);
671
672 return Vec3T(m1, m2);
673#else
674 return Vec3T(lhs.xyzw ^ rhs.xyzw);
675#endif
676 }

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::dot(const Vec3T& lhs, const Vec3T& rhs)
  {
#if AGX_USE_AVX()
    // NOTE: No _mm256_dp_pd instruction available!
    // const int mask = 0x71;
    // AGX_ALIGNED( agx::Real64, 32 ) output[4];
    // _mm256_store_pd( output, _mm256_dp_pd(lhs.xyzw, rhs.xyzw, mask) );
    // return output[0];

  #if 1
    __m128d lhsXY = _mm256_extractf128_pd( lhs.xyzw, 0 );
    __m128d lhsZW = _mm256_extractf128_pd( lhs.xyzw, 1 );
    __m128d rhsXY = _mm256_extractf128_pd( rhs.xyzw, 0 );
    __m128d rhsZW = _mm256_extractf128_pd( rhs.xyzw, 1 );

    // _mm_dp_pd mask 0x31: multiply both double lanes (high nibble 0x3) and
    // write the sum to lane 0 of the result (low nibble 0x1).
    const int mask = 0x31;
    AGX_ALIGNED( agx::Real64, 16 ) output[2];

    const __m128d xyDot = _mm_dp_pd(lhsXY, rhsXY, mask);
    const __m128d zDot = _mm_mul_pd(lhsZW, rhsZW);

    _mm_store_pd( output, _mm_add_pd(xyDot, zDot));
    return output[0];

  #else
    __m256d a = _mm256_mul_pd( lhs.xyzw, rhs.xyzw );
    __m256d b = _mm256_hadd_pd( a, a );
    __m128d hi128 = _mm256_extractf128_pd( b, 1 );
    __m128d dotproduct = _mm_add_pd( _mm256_castpd256_pd128(b), hi128 );

    AGX_ALIGNED( agx::Real64, 16 ) output[2];
    _mm_store_pd( output, dotproduct );
    return output[0];
  #endif

#elif AGX_USE_SSE4()
    const int mask = 0x31;
    AGX_ALIGNED( agx::Real64, 16 ) output[2];

    const __m128d xyDot = _mm_dp_pd(lhs.xy, rhs.xy, mask);
    const __m128d zDot = _mm_mul_pd(lhs.zw, rhs.zw);

    _mm_store_pd( output, _mm_add_pd(xyDot, zDot));
    return output[0];
#elif AGX_USE_SSE()
    AGX_ALIGNED( agx::Real64, 16 ) scalarLhs[4];
    AGX_ALIGNED( agx::Real64, 16 ) scalarRhs[4];

    _mm_store_pd( scalarLhs, lhs.xy );
    _mm_store_pd( scalarLhs + 2, lhs.zw );
    _mm_store_pd( scalarRhs, rhs.xy );
    _mm_store_pd( scalarRhs + 2, rhs.zw );

    return scalarLhs[0] * scalarRhs[0] + scalarLhs[1] * scalarRhs[1] + scalarLhs[2] * scalarRhs[2];
#else
    return lhs.xyzw * rhs.xyzw;
#endif
  }

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::length2() const
  {
    return dot(*this, *this);
  }

  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::length() const
  {
#if AGX_USE_AVX() && 0
    // NOTE: No _mm256_dp_pd instruction available!
    AGX_ALIGNED( agx::Real64, 32 ) output[4];
    _mm256_store_pd(output, _mm256_sqrt_pd(_mm256_dp_pd(xyzw, xyzw, 0x71)));
    return output[0];
#elif AGX_USE_SSE()
    return std::sqrt(this->length2());
#else
    return (agx::Real64)xyzw.length();
#endif
  }


  AGX_FORCE_INLINE agx::Real64 Vec3T<agx::Real64>::innerProduct(const Vec3T& v01, const Vec3T& v02, const Vec3T& v11, const Vec3T& v12)
  {
#if AGX_USE_AVX()

    AGX_ALIGNED( agx::Real64, 32 ) output[4];

    __m256d tmp1 = _mm256_mul_pd( v01.xyzw, v11.xyzw );
    __m256d tmp2 = _mm256_mul_pd( v02.xyzw, v12.xyzw );

    tmp1 = _mm256_add_pd( tmp1, tmp2 );

    _mm256_store_pd( output, tmp1 );

    return output[0]+output[1]+output[2];

#elif AGX_USE_SSE()

    //  A0   B0           A0 * B0
    //  A1   B1           A1 * B1            A0 * B0 + A2 * B2
    //  --------- mul  => --------- add  => ------------------- hadd => A0 * B0 + A2 * B2 + A1 * B1 + A3 * B3
    //  A2   B2           A2 * B2            A1 * B1 + A3 * B3
    //  A3   B3           A3 * B3

    __m128d tmp1 = _mm_mul_pd( v01.xy, v11.xy );
    __m128d tmp2 = _mm_mul_pd( v01.zw, v11.zw );

    __m128d tmp3 = _mm_mul_pd( v02.xy, v12.xy );
    __m128d tmp4 = _mm_mul_pd( v02.zw, v12.zw );


    tmp1 = _mm_add_pd( tmp1, tmp2 );
    tmp3 = _mm_add_pd( tmp3, tmp4 );

    AGX_ALIGNED( agx::Real64, 16 ) output[2];

    tmp1 = _mm_hadd_pd( tmp1, tmp3 );
    _mm_store_pd( output, tmp1 );

    return output[0] + output[1];
#else
    return v01.xyzw * v11.xyzw + v02.xyzw * v12.xyzw;
#endif
  }


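  // Streaming goes through an explicitly aligned temporary because store()
  // uses aligned SIMD stores (see ALIGNMENT in each specialization).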
  AGX_FORCE_INLINE std::ostream& operator << ( std::ostream& output, const Vec3d& v )
  {
    AGX_ALIGNED(agx::Vec3d, 32) tmp;
    v.store(tmp);
    return output << tmp;
  }

  AGX_FORCE_INLINE std::ostream& operator << ( std::ostream& output, const Vec3f& v )
  {
    AGX_ALIGNED(agx::Vec3f, 16) tmp;
    v.store(tmp);
    return output << tmp;
  }

}

#endif /* AGX_SIMD_VEC3_H */