Mjolnir Core
Core functionality of the Mjolnir API
intrinsics.h
Go to the documentation of this file.
1 
7 
8 #pragma once
9 
10 
11 // === DECLARATIONS ===================================================================================================
12 
14 
15 #include <concepts>
16 
17 namespace mjolnir::x86
18 {
21 
22 
36 template <FloatVectorRegister T_RegisterType>
37 [[nodiscard]] inline auto mm_add(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
38 
39 
53 template <FloatVectorRegister T_RegisterType>
54 [[nodiscard]] inline auto mm_and(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
55 
56 
70 template <FloatVectorRegister T_RegisterType>
71 [[nodiscard]] inline auto mm_andnot(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
72 
73 
90 template <I32 t_mask, FloatVectorRegister T_RegisterType>
91 [[nodiscard]] inline auto mm_blend(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
92 
93 
112 template <FloatVectorRegister T_RegisterType>
113 [[nodiscard]] inline auto mm_broadcast(T_RegisterType src) noexcept -> T_RegisterType;
114 
115 
129 template <FloatVectorRegister T_RegisterTypeIn>
130 [[nodiscard]] inline auto mm_cast_fi(T_RegisterTypeIn src) noexcept;
131 
132 
146 template <FloatVectorRegister T_RegisterTypeOut, IntegerVectorRegister T_RegisterTypeIn>
147 [[nodiscard]] inline auto mm_cast_if(T_RegisterTypeIn src) noexcept -> T_RegisterTypeOut;
148 
149 
167 template <FloatVectorRegister T_RegisterType>
168 [[nodiscard]] inline auto mm_cmp_eq(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
169 
170 
188 template <FloatVectorRegister T_RegisterType>
189 [[nodiscard]] inline auto mm_cmp_ge(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
190 
191 
209 template <FloatVectorRegister T_RegisterType>
210 [[nodiscard]] inline auto mm_cmp_gt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
211 
212 
230 template <FloatVectorRegister T_RegisterType>
231 [[nodiscard]] inline auto mm_cmp_le(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
232 
233 
251 template <FloatVectorRegister T_RegisterType>
252 [[nodiscard]] inline auto mm_cmp_lt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
253 
254 
266 template <FloatVectorRegister T_RegisterType>
267 [[nodiscard]] inline auto mm_cvt_float(T_RegisterType src) -> ElementType<T_RegisterType>;
268 
269 
285 template <FloatVectorRegister T_RegisterType>
286 [[nodiscard]] inline auto mm_fmadd(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType;
287 
288 
304 template <FloatVectorRegister T_RegisterType>
305 [[nodiscard]] inline auto mm_fmsub(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType;
306 
307 
319 template <FloatVectorRegister T_RegisterType>
320 [[nodiscard]] inline auto mm_load(ElementType<T_RegisterType>* ptr) noexcept -> T_RegisterType;
321 
322 
337 template <IntegerVectorRegister T_RegisterType>
338 [[nodiscard]] inline auto mm_movemask_epi8(T_RegisterType src) noexcept;
339 
340 
354 template <FloatVectorRegister T_RegisterType>
355 [[nodiscard]] inline auto mm_mul(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
356 
357 
371 template <FloatVectorRegister T_RegisterType>
372 [[nodiscard]] inline auto mm_or(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
373 
374 
389 template <I32 t_mask, FloatVectorRegister T_RegisterType>
390 [[nodiscard]] inline auto mm_permute(T_RegisterType src) noexcept -> T_RegisterType;
391 
392 
410 template <I32 t_mask, FloatAVXRegister T_RegisterType>
411 [[nodiscard]] inline auto mm_permute2f128(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
412 
413 
425 template <FloatVectorRegister T_RegisterType>
426 [[nodiscard]] inline auto mm_set1(ElementType<T_RegisterType> value) noexcept -> T_RegisterType;
427 
428 
442 template <FloatVectorRegister T_RegisterType, typename... T_Args>
443 [[nodiscard]] inline auto mm_setr(T_Args... args) noexcept -> T_RegisterType;
444 
445 
454 template <FloatVectorRegister T_RegisterType>
455 [[nodiscard]] inline auto mm_setzero() noexcept -> T_RegisterType;
456 
457 
474 template <UST t_mask, FloatVectorRegister T_RegisterType>
475 [[nodiscard]] inline auto mm_shuffle(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
476 
477 
488 template <FloatVectorRegister T_RegisterType>
489 inline void mm_store(ElementType<T_RegisterType>* ptr, T_RegisterType reg) noexcept;
490 
491 
505 template <FloatVectorRegister T_RegisterType>
506 [[nodiscard]] inline auto mm_sub(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType;
507 
508 
522 template <FloatVectorRegister T_RegisterType>
523 [[nodiscard]] inline auto mm_xor(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType;
524 
525 
527 } // namespace mjolnir::x86
528 
529 
530 // === DEFINITIONS ====================================================================================================
531 
532 #include "mjolnir/core/utility/pointer_operations.h"
533 #include "mjolnir/core/x86/x86.h"
534 
535 #include <cassert>
536 #include <utility>
537 
538 
539 namespace mjolnir::x86
540 {
541 // --------------------------------------------------------------------------------------------------------------------
542 
543 template <FloatVectorRegister T_RegisterType>
544 [[nodiscard]] inline auto mm_add(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
545 {
546  if constexpr (is_m128<T_RegisterType>)
547  return _mm_add_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
548  else if constexpr (is_m128d<T_RegisterType>)
549  return _mm_add_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
550  else if constexpr (is_m256<T_RegisterType>)
551  return _mm256_add_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
552  else
553  return _mm256_add_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
554 }
555 
556 
557 // --------------------------------------------------------------------------------------------------------------------
558 
559 template <FloatVectorRegister T_RegisterType>
560 [[nodiscard]] inline auto mm_and(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
561 {
562  if constexpr (is_m128<T_RegisterType>)
563  return _mm_and_ps(a, b);
564  else if constexpr (is_m128d<T_RegisterType>)
565  return _mm_and_pd(a, b);
566  else if constexpr (is_m256<T_RegisterType>)
567  return _mm256_and_ps(a, b);
568  else
569  return _mm256_and_pd(a, b);
570 }
571 
572 
573 // --------------------------------------------------------------------------------------------------------------------
574 
575 template <FloatVectorRegister T_RegisterType>
576 [[nodiscard]] inline auto mm_andnot(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
577 {
578  if constexpr (is_m128<T_RegisterType>)
579  return _mm_andnot_ps(a, b);
580  else if constexpr (is_m128d<T_RegisterType>)
581  return _mm_andnot_pd(a, b);
582  else if constexpr (is_m256<T_RegisterType>)
583  return _mm256_andnot_ps(a, b);
584  else
585  return _mm256_andnot_pd(a, b);
586 }
587 
588 
589 // --------------------------------------------------------------------------------------------------------------------
590 
591 template <I32 t_mask, FloatVectorRegister T_RegisterType>
592 [[nodiscard]] inline auto mm_blend(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
593 {
594  if constexpr (is_m128<T_RegisterType>)
595  return _mm_blend_ps(a, b, t_mask);
596  else if constexpr (is_m128d<T_RegisterType>)
597  return _mm_blend_pd(a, b, t_mask);
598  else if constexpr (is_m256<T_RegisterType>)
599  return _mm256_blend_ps(a, b, t_mask);
600  else
601  return _mm256_blend_pd(a, b, t_mask);
602 }
603 
604 
605 // --------------------------------------------------------------------------------------------------------------------
606 
607 template <FloatVectorRegister T_RegisterType>
608 [[nodiscard]] inline auto mm_broadcast(T_RegisterType src) noexcept -> T_RegisterType
609 {
610  if constexpr (is_m128<T_RegisterType>)
611  return _mm_broadcastss_ps(src);
612  else if constexpr (is_m128d<T_RegisterType>)
613  // The following command is currently missing in gcc - see https://stackoverflow.com/q/58270381/6700329
614  // Should be fixed in gcc 11 - see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483
615  // return _mm_broadcastsd_pd(src);
616  return _mm_movedup_pd(src);
617  else if constexpr (is_m256<T_RegisterType>)
618  return _mm256_broadcastss_ps(_mm256_castps256_ps128(src));
619  else
620  return _mm256_broadcastsd_pd(_mm256_castpd256_pd128(src));
621 }
622 
623 
624 // --------------------------------------------------------------------------------------------------------------------
625 
626 template <FloatVectorRegister T_RegisterTypeIn>
627 [[nodiscard]] inline auto mm_cast_fi(T_RegisterTypeIn src) noexcept
628 {
629  if constexpr (is_m128<T_RegisterTypeIn>)
630  return _mm_castps_si128(src);
631  else if constexpr (is_m128d<T_RegisterTypeIn>)
632  return _mm_castpd_si128(src);
633  else if constexpr (is_m256<T_RegisterTypeIn>)
634  return _mm256_castps_si256(src);
635  else
636  return _mm256_castpd_si256(src);
637 }
638 
639 
640 // --------------------------------------------------------------------------------------------------------------------
641 
642 template <FloatVectorRegister T_RegisterTypeOut, IntegerVectorRegister T_RegisterTypeIn>
643 [[nodiscard]] inline auto mm_cast_if(T_RegisterTypeIn src) noexcept -> T_RegisterTypeOut
644 {
645  if constexpr (is_m128<T_RegisterTypeOut>)
646  return _mm_castsi128_ps(src);
647  else if constexpr (is_m128d<T_RegisterTypeOut>)
648  return _mm_castsi128_pd(src);
649  else if constexpr (is_m256<T_RegisterTypeOut>)
650  return _mm256_castsi256_ps(src);
651  else
652  return _mm256_castsi256_pd(src);
653 }
654 
655 
656 // --------------------------------------------------------------------------------------------------------------------
657 
658 template <FloatVectorRegister T_RegisterType>
659 [[nodiscard]] inline auto mm_cmp_eq(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
660 {
661  if constexpr (is_m128<T_RegisterType>)
662  return _mm_cmpeq_ps(lhs, rhs);
663  else if constexpr (is_m128d<T_RegisterType>)
664  return _mm_cmpeq_pd(lhs, rhs);
665  else if constexpr (is_m256<T_RegisterType>)
666  return _mm256_cmp_ps(lhs, rhs, _CMP_EQ_OS);
667  else
668  return _mm256_cmp_pd(lhs, rhs, _CMP_EQ_OS);
669 }
670 
671 
672 // --------------------------------------------------------------------------------------------------------------------
673 
674 template <FloatVectorRegister T_RegisterType>
675 [[nodiscard]] inline auto mm_cmp_ge(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
676 {
677  if constexpr (is_m128<T_RegisterType>)
678  return _mm_cmpge_ps(lhs, rhs);
679  else if constexpr (is_m128d<T_RegisterType>)
680  return _mm_cmpge_pd(lhs, rhs);
681  else if constexpr (is_m256<T_RegisterType>)
682  return _mm256_cmp_ps(lhs, rhs, _CMP_GE_OS);
683  else
684  return _mm256_cmp_pd(lhs, rhs, _CMP_GE_OS);
685 }
686 
687 
688 // --------------------------------------------------------------------------------------------------------------------
689 
690 template <FloatVectorRegister T_RegisterType>
691 [[nodiscard]] inline auto mm_cmp_gt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
692 {
693  if constexpr (is_m128<T_RegisterType>)
694  return _mm_cmpgt_ps(lhs, rhs);
695  else if constexpr (is_m128d<T_RegisterType>)
696  return _mm_cmpgt_pd(lhs, rhs);
697  else if constexpr (is_m256<T_RegisterType>)
698  return _mm256_cmp_ps(lhs, rhs, _CMP_GT_OS);
699  else
700  return _mm256_cmp_pd(lhs, rhs, _CMP_GT_OS);
701 }
702 
703 
704 // --------------------------------------------------------------------------------------------------------------------
705 
706 template <FloatVectorRegister T_RegisterType>
707 [[nodiscard]] inline auto mm_cmp_le(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
708 {
709  if constexpr (is_m128<T_RegisterType>)
710  return _mm_cmple_ps(lhs, rhs);
711  else if constexpr (is_m128d<T_RegisterType>)
712  return _mm_cmple_pd(lhs, rhs);
713  else if constexpr (is_m256<T_RegisterType>)
714  return _mm256_cmp_ps(lhs, rhs, _CMP_LE_OS);
715  else
716  return _mm256_cmp_pd(lhs, rhs, _CMP_LE_OS);
717 }
718 
719 
720 // --------------------------------------------------------------------------------------------------------------------
721 
722 template <FloatVectorRegister T_RegisterType>
723 [[nodiscard]] inline auto mm_cmp_lt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
724 {
725  if constexpr (is_m128<T_RegisterType>)
726  return _mm_cmplt_ps(lhs, rhs);
727  else if constexpr (is_m128d<T_RegisterType>)
728  return _mm_cmplt_pd(lhs, rhs);
729  else if constexpr (is_m256<T_RegisterType>)
730  return _mm256_cmp_ps(lhs, rhs, _CMP_LT_OS);
731  else
732  return _mm256_cmp_pd(lhs, rhs, _CMP_LT_OS);
733 }
734 
735 
736 // --------------------------------------------------------------------------------------------------------------------
737 
738 template <FloatVectorRegister T_RegisterType>
739 [[nodiscard]] inline auto mm_cvt_float(T_RegisterType src) -> ElementType<T_RegisterType>
740 {
741  if constexpr (is_m128<T_RegisterType>)
742  return _mm_cvtss_f32(src);
743  else if constexpr (is_m128d<T_RegisterType>)
744  return _mm_cvtsd_f64(src);
745  else if constexpr (is_m256<T_RegisterType>)
746  return _mm256_cvtss_f32(src);
747  else
748  return _mm256_cvtsd_f64(src);
749 }
750 
751 
752 // --------------------------------------------------------------------------------------------------------------------
753 
754 template <FloatVectorRegister T_RegisterType>
755 [[nodiscard]] inline auto mm_fmadd(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType
756 {
757  if constexpr (is_m128<T_RegisterType>)
758  return _mm_fmadd_ps(a, b, c);
759  else if constexpr (is_m128d<T_RegisterType>)
760  return _mm_fmadd_pd(a, b, c);
761  else if constexpr (is_m256<T_RegisterType>)
762  return _mm256_fmadd_ps(a, b, c);
763  else
764  return _mm256_fmadd_pd(a, b, c);
765 }
766 
767 
768 // --------------------------------------------------------------------------------------------------------------------
769 
770 template <FloatVectorRegister T_RegisterType>
771 [[nodiscard]] inline auto mm_fmsub(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType
772 {
773  if constexpr (is_m128<T_RegisterType>)
774  return _mm_fmsub_ps(a, b, c);
775  else if constexpr (is_m128d<T_RegisterType>)
776  return _mm_fmsub_pd(a, b, c);
777  else if constexpr (is_m256<T_RegisterType>)
778  return _mm256_fmsub_ps(a, b, c);
779  else
780  return _mm256_fmsub_pd(a, b, c);
781 }
782 
783 
784 // --------------------------------------------------------------------------------------------------------------------
785 
786 template <FloatVectorRegister T_RegisterType>
787 [[nodiscard]] inline auto mm_load(ElementType<T_RegisterType>* ptr) noexcept -> T_RegisterType
788 {
789  assert(is_aligned<alignment_bytes<T_RegisterType>>(ptr)); // NOLINT
790 
791  if constexpr (is_m128<T_RegisterType>)
792  return _mm_load_ps(ptr);
793  else if constexpr (is_m128d<T_RegisterType>)
794  return _mm_load_pd(ptr);
795  else if constexpr (is_m256<T_RegisterType>)
796  return _mm256_load_ps(ptr);
797  else
798  return _mm256_load_pd(ptr);
799 }
800 
801 
802 // --------------------------------------------------------------------------------------------------------------------
803 
804 template <IntegerVectorRegister T_RegisterType>
805 [[nodiscard]] inline auto mm_movemask_epi8(T_RegisterType src) noexcept
806 {
807  if constexpr (is_m128i<T_RegisterType>)
808  return static_cast<U16>(_mm_movemask_epi8(src));
809  else
810  return static_cast<U32>(_mm256_movemask_epi8(src));
811 }
812 
813 
814 // --------------------------------------------------------------------------------------------------------------------
815 
816 template <FloatVectorRegister T_RegisterType>
817 [[nodiscard]] inline auto mm_mul(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
818 {
819  if constexpr (is_m128<T_RegisterType>)
820  return _mm_mul_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
821  else if constexpr (is_m128d<T_RegisterType>)
822  return _mm_mul_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
823  else if constexpr (is_m256<T_RegisterType>)
824  return _mm256_mul_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
825  else
826  return _mm256_mul_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
827 }
828 
829 
830 // --------------------------------------------------------------------------------------------------------------------
831 
832 template <FloatVectorRegister T_RegisterType>
833 [[nodiscard]] inline auto mm_or(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
834 {
835  if constexpr (is_m128<T_RegisterType>)
836  return _mm_or_ps(a, b);
837  else if constexpr (is_m128d<T_RegisterType>)
838  return _mm_or_pd(a, b);
839  else if constexpr (is_m256<T_RegisterType>)
840  return _mm256_or_ps(a, b);
841  else
842  return _mm256_or_pd(a, b);
843 }
844 
845 
846 // --------------------------------------------------------------------------------------------------------------------
847 
848 template <I32 t_mask, FloatVectorRegister T_RegisterType>
849 [[nodiscard]] inline auto mm_permute(T_RegisterType src) noexcept -> T_RegisterType
850 {
851  if constexpr (is_m128<T_RegisterType>)
852  return _mm_permute_ps(src, t_mask);
853  else if constexpr (is_m128d<T_RegisterType>)
854  return _mm_permute_pd(src, t_mask);
855  else if constexpr (is_m256<T_RegisterType>)
856  return _mm256_permute_ps(src, t_mask);
857  else
858  return _mm256_permute_pd(src, t_mask);
859 }
860 
861 
862 // --------------------------------------------------------------------------------------------------------------------
863 
864 template <I32 t_mask, FloatAVXRegister T_RegisterType>
865 [[nodiscard]] inline auto mm_permute2f128(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
866 {
867  if constexpr (is_m256<T_RegisterType>)
868  return _mm256_permute2f128_ps(a, b, t_mask);
869  else
870  return _mm256_permute2f128_pd(a, b, t_mask);
871 }
872 
873 
874 // --------------------------------------------------------------------------------------------------------------------
875 
876 template <FloatVectorRegister T_RegisterType>
877 [[nodiscard]] inline auto mm_set1(ElementType<T_RegisterType> value) noexcept -> T_RegisterType
878 {
879  if constexpr (is_m128<T_RegisterType>)
880  return _mm_set1_ps(value);
881  else if constexpr (is_m128d<T_RegisterType>)
882  return _mm_set1_pd(value);
883  else if constexpr (is_m256<T_RegisterType>)
884  return _mm256_set1_ps(value);
885  else
886  return _mm256_set1_pd(value);
887 }
888 
889 
890 // --------------------------------------------------------------------------------------------------------------------
891 
892 template <FloatVectorRegister T_RegisterType, typename... T_Args>
893 [[nodiscard]] inline auto mm_setr(T_Args... args) noexcept -> T_RegisterType
894 {
895  using EType = ElementType<T_RegisterType>;
896 
897  if constexpr (is_m128<T_RegisterType>)
898  return _mm_setr_ps(static_cast<EType>(args)...);
899  else if constexpr (is_m128d<T_RegisterType>)
900  return _mm_setr_pd(static_cast<EType>(args)...);
901  else if constexpr (is_m256<T_RegisterType>)
902  return _mm256_setr_ps(static_cast<EType>(args)...);
903  else
904  return _mm256_setr_pd(static_cast<EType>(args)...);
905 }
906 
907 
908 // --------------------------------------------------------------------------------------------------------------------
909 
910 template <FloatVectorRegister T_RegisterType>
911 [[nodiscard]] inline auto mm_setzero() noexcept -> T_RegisterType
912 {
913  if constexpr (is_m128<T_RegisterType>)
914  return _mm_setzero_ps();
915  else if constexpr (is_m128d<T_RegisterType>)
916  return _mm_setzero_pd();
917  else if constexpr (is_m256<T_RegisterType>)
918  return _mm256_setzero_ps();
919  else
920  return _mm256_setzero_pd();
921 }
922 
923 
924 // --------------------------------------------------------------------------------------------------------------------
925 
926 template <UST t_mask, FloatVectorRegister T_RegisterType>
927 [[nodiscard]] inline auto mm_shuffle(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
928 {
929  if constexpr (is_m128<T_RegisterType>)
930  return _mm_shuffle_ps(a, b, t_mask);
931  else if constexpr (is_m128d<T_RegisterType>)
932  return _mm_shuffle_pd(a, b, t_mask);
933  else if constexpr (is_m256<T_RegisterType>)
934  return _mm256_shuffle_ps(a, b, t_mask);
935  else
936  return _mm256_shuffle_pd(a, b, t_mask);
937 }
938 
939 
940 // --------------------------------------------------------------------------------------------------------------------
941 
942 template <FloatVectorRegister T_RegisterType>
943 inline void mm_store(ElementType<T_RegisterType>* ptr, T_RegisterType reg) noexcept
944 {
945  assert(is_aligned<alignment_bytes<T_RegisterType>>(ptr)); // NOLINT
946 
947  if constexpr (is_m128<T_RegisterType>)
948  _mm_store_ps(ptr, reg);
949  else if constexpr (is_m128d<T_RegisterType>)
950  _mm_store_pd(ptr, reg);
951  else if constexpr (is_m256<T_RegisterType>)
952  _mm256_store_ps(ptr, reg);
953  else
954  _mm256_store_pd(ptr, reg);
955 }
956 
957 
958 // --------------------------------------------------------------------------------------------------------------------
959 
960 template <FloatVectorRegister T_RegisterType>
961 [[nodiscard]] inline auto mm_sub(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
962 {
963  if constexpr (is_m128<T_RegisterType>)
964  return _mm_sub_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
965  else if constexpr (is_m128d<T_RegisterType>)
966  return _mm_sub_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
967  else if constexpr (is_m256<T_RegisterType>)
968  return _mm256_sub_ps(lhs, rhs); // NOLINT(portability-simd-intrinsics)
969  else
970  return _mm256_sub_pd(lhs, rhs); // NOLINT(portability-simd-intrinsics)
971 }
972 
973 
974 // --------------------------------------------------------------------------------------------------------------------
975 
976 template <FloatVectorRegister T_RegisterType>
977 [[nodiscard]] inline auto mm_xor(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
978 {
979  if constexpr (is_m128<T_RegisterType>)
980  return _mm_xor_ps(a, b);
981  else if constexpr (is_m128d<T_RegisterType>)
982  return _mm_xor_pd(a, b);
983  else if constexpr (is_m256<T_RegisterType>)
984  return _mm256_xor_ps(a, b);
985  else
986  return _mm256_xor_pd(a, b);
987 }
988 
989 
990 } // namespace mjolnir::x86
std::uint32_t U32
32 bit unsigned integer type
Definition: fundamental_types.h:27
std::uint16_t U16
16 bit unsigned integer type
Definition: fundamental_types.h:26
auto is_aligned(const volatile T_Type *pointer) noexcept -> bool
Check if a passed pointer is aligned.
Definition: pointer_operations.h:191
auto mm_cvt_float(T_RegisterType src) -> ElementType< T_RegisterType >
Return the first element of src.
Definition: intrinsics.h:739
auto mm_sub(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Subtract rhs element-wise from lhs and return the result.
Definition: intrinsics.h:961
auto mm_setr(T_Args... args) noexcept -> T_RegisterType
Set register elements with the supplied values in reverse order.
Definition: intrinsics.h:893
auto mm_broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcasts the lowest floating point element across lanes to all elements of the returned register.
Definition: intrinsics.h:608
auto mm_andnot(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Compute the bitwise NOT of all elements in a and then AND with b.
Definition: intrinsics.h:576
auto mm_cmp_le(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Compare element-wise if the register elements of lhs are less than or equal to the ones in rhs.
Definition: intrinsics.h:707
auto mm_fmsub(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType
Perform an element-wise multiplication of a and b, subtract c and return the result.
Definition: intrinsics.h:771
auto mm_cast_if(T_RegisterTypeIn src) noexcept -> T_RegisterTypeOut
Bit cast an integer vector register to an equally sized floating-point vector register.
Definition: intrinsics.h:643
typename std::conditional_t< is_any_of< T_RegisterType, __m128d, __m256d >(), F64, F32 > ElementType
The element type of an x86 vector register that is based on floating-point types.
Definition: definitions.h:212
auto mm_and(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Compute the bitwise AND of a and b.
Definition: intrinsics.h:560
auto mm_fmadd(T_RegisterType a, T_RegisterType b, T_RegisterType c) noexcept -> T_RegisterType
Perform an element-wise multiplication of a and b, add c and return the result.
Definition: intrinsics.h:755
auto mm_blend(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Blend elements from a and b using a control mask and return the resulting vector register.
Definition: intrinsics.h:592
auto mm_xor(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Compute the bitwise XOR of a and b.
Definition: intrinsics.h:977
auto mm_shuffle(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Return a register with the first half of the lane elements selected from a and the second half from b...
Definition: intrinsics.h:927
void mm_store(ElementType< T_RegisterType > *ptr, T_RegisterType reg) noexcept
Store the content of a register to a memory address.
Definition: intrinsics.h:943
auto mm_cmp_ge(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Compare element-wise if the register elements of lhs are greater than or equal to the ones in rhs.
Definition: intrinsics.h:675
concept FloatVectorRegister
Concept for a x86 vector register that has floating-point elements.
Definition: definitions.h:39
auto mm_cmp_lt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Compare element-wise if the register elements of lhs are less than the ones in rhs.
Definition: intrinsics.h:723
auto mm_cmp_gt(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Compare element-wise if the register elements of lhs are greater than the ones in rhs.
Definition: intrinsics.h:691
auto mm_permute2f128(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Shuffle 128-bit lanes selected by t_mask from a and b, and return the results in a new register.
Definition: intrinsics.h:865
auto mm_mul(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Perform an element-wise multiplication of lhs and rhs and return the result.
Definition: intrinsics.h:817
auto mm_cmp_eq(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Compare the register elements in lhs and rhs for equality and return the result.
Definition: intrinsics.h:659
auto mm_movemask_epi8(T_RegisterType src) noexcept
Create mask from the most significant bit of each 8-bit element in src, and return the result as unsi...
Definition: intrinsics.h:805
auto mm_cast_fi(T_RegisterTypeIn src) noexcept
Bit cast a floating-point vector register to an equally sized integer vector register.
Definition: intrinsics.h:627
auto mm_add(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Perform an element-wise addition of lhs and rhs and return the result.
Definition: intrinsics.h:544
auto mm_set1(ElementType< T_RegisterType > value) noexcept -> T_RegisterType
Broadcast a single value to all elements of the register.
Definition: intrinsics.h:877
auto mm_setzero() noexcept -> T_RegisterType
Return a vector register with all elements set to zero.
Definition: intrinsics.h:911
auto mm_or(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Compute the bitwise OR of a and b.
Definition: intrinsics.h:833
auto mm_permute(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements in src using the control mask t_mask and return the resulting vector register.
Definition: intrinsics.h:849
auto mm_load(ElementType< T_RegisterType > *ptr) noexcept -> T_RegisterType
Load data from an aligned memory location into a new register.
Definition: intrinsics.h:787
Contains x86 vectorization specific constants, concepts and definitions.
This header includes the correct x86 header depending on the operating system.