Mjolnir Core
Core functionality of the Mjolnir API
permutation.h
Go to the documentation of this file.
1 
7 
8 #pragma once
9 
10 
11 // === DECLARATIONS ===================================================================================================
12 
15 
16 
17 namespace mjolnir::x86
18 {
21 
22 
//! Declaration of `align_right`: concatenate two floating-point registers and shift the result right by `t_shift`
//! elements. The definition in this file requires `t_shift <= num_lane_elements` and returns `rhs` for a shift of 0
//! and `lhs` for a shift of exactly `num_lane_elements`.
38 template <UST t_shift, FloatVectorRegister T_RegisterType>
39 [[nodiscard]] inline auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept
40  -> T_RegisterType;
41 
42 
//! Declaration of `blend`: blend elements from `src_0` and `src_1` into a new register. The definition requires
//! exactly one template value per register element and every value to be 0 or 1.
//! NOTE(review): a value of 1 presumably selects the element of `src_1` (cf. `blend_at`) - confirm against `mm_blend`.
61 template <UST... t_args, FloatVectorRegister T_RegisterType>
62 [[nodiscard]] inline auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
63 
64 
//! Declaration of `blend_above`: get a register where the elements with a higher index than `t_index` are copied
//! from `src_1` and the remaining ones from `src_0`. `t_index` must be smaller than the number of register elements.
81 template <UST t_index, FloatVectorRegister T_RegisterType>
82 [[nodiscard]] inline auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
83 
84 
//! Declaration of `blend_at`: get a new register where the element with index `t_index` is taken from `src_1` and
//! the rest from `src_0`. `t_index` must be smaller than the number of register elements.
100 template <UST t_index, FloatVectorRegister T_RegisterType>
101 [[nodiscard]] inline auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
102 
103 
//! Declaration of `blend_below`: get a register where the elements with a lower index than `t_index` are copied
//! from `src_1` and the remaining ones from `src_0`. `t_index` must be smaller than the number of register elements.
120 template <UST t_index, FloatVectorRegister T_RegisterType>
121 [[nodiscard]] inline auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
122 
123 
//! Declaration of `blend_from_to`: get a register where the elements inside the index range
//! [`t_index_first`, `t_index_last`] (both bounds included) are taken from `src_1` and the rest from `src_0`.
//! The definition requires `t_index_first <= t_index_last` and `t_index_last` to be a valid element index.
141 template <UST t_index_first, UST t_index_last, FloatVectorRegister T_RegisterType>
142 [[nodiscard]] inline auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
143 
144 
//! Declaration of the single-index `broadcast`: replicate the lane element selected by `t_index` to all elements.
//! `t_index` must be smaller than the number of lane elements; the definition is built on the lane-wise `permute`.
161 template <UST t_index, FloatVectorRegister T_RegisterType>
162 [[nodiscard]] inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType;
163 
164 
//! Declaration of the two-index `broadcast` (AVX registers only): broadcast a register element per lane, selected
//! by `t_index_0` and `t_index_1`. Both indices must be smaller than the number of lane elements.
183 template <UST t_index_0, UST t_index_1, FloatAVXRegister T_RegisterType>
184 [[nodiscard]] inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType;
185 
186 
//! Declaration of `broadcast_across_lanes`: broadcast the register element selected by `t_index` across lane
//! boundaries. `t_index` must be smaller than the number of register elements.
203 template <UST t_index, FloatVectorRegister T_RegisterType>
204 [[nodiscard]] inline auto broadcast_across_lanes(T_RegisterType src) noexcept -> T_RegisterType;
205 
206 
//! Declaration of `exchange`: exchange two elements selected by indices between two registers. Element `t_index_0`
//! of `reg_0` and element `t_index_1` of `reg_1` trade places; both registers are modified in place.
225 template <UST t_index_0, UST t_index_1, FloatVectorRegister T_RegisterType>
226 inline void exchange(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept;
227 
228 
//! Declaration of `insert`: insert a single element from `src` into `dst` and return the result in a new `__m128`
//! register. `t_index_src` / `t_index_dst` select the source and destination element; the optional `t_set_zero`
//! flags are packed into the zero mask that the definition passes to `_mm_insert_ps`.
253 template <UST t_index_src, UST t_index_dst, bool... t_set_zero>
254 inline auto insert(__m128 src, __m128 dst) noexcept -> __m128;
255 
256 
//! Declaration of `permute`: shuffle the elements of a vector register within lanes using indices and return the
//! result in a new register. The definition accepts one index per lane element (or, for AVX registers, one index
//! per register element); every index must be smaller than the number of lane elements.
276 template <UST... t_indices, FloatVectorRegister T_RegisterType>
277 [[nodiscard]] inline auto permute(T_RegisterType src) noexcept -> T_RegisterType;
278 
279 
//! Declaration of `permute_across_lanes`: shuffle the elements of a vector register across lanes using indices and
//! return the result in a new register. One index per register element is required; every index must be smaller
//! than the number of register elements.
295 template <UST... t_indices, FloatVectorRegister T_RegisterType>
296 [[nodiscard]] inline auto permute_across_lanes(T_RegisterType src) noexcept -> T_RegisterType;
297 
298 
//! Declaration of `permute_lanes`: create a new AVX register by an arbitrary combination of the lanes of `src`.
//! NOTE(review): `t_lane_0` presumably picks the source lane of the first result lane and `t_lane_1` that of the
//! second one - confirm against `shuffle_lanes`, to which the definition forwards.
316 template <UST t_lane_0, UST t_lane_1, FloatAVXRegister T_RegisterType>
317 [[nodiscard]] inline auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType;
318 
319 
//! Declaration of `shuffle`: return a register with the first half of the lane elements selected from `src_0` and
//! the second half presumably from `src_1` (TODO confirm against `mm_shuffle`). The definition expects one index
//! per lane element (or per register element for `__m256d`), each smaller than the number of lane elements.
341 template <UST... t_indices, FloatVectorRegister T_RegisterType>
342 [[nodiscard]] inline auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
343 
344 
//! Declaration of `shuffle_lanes`: create a new AVX register by combining arbitrary lanes from two source
//! registers. Each result lane is described by a (source register, source lane) pair; all four template values
//! must be 0 or 1.
368 template <UST t_src_0, UST t_lane_0, UST t_src_1, UST t_lane_1, FloatAVXRegister T_RegisterType>
369 [[nodiscard]] inline auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
370 
371 
//! Declaration of `swap`: swap two elements of a register and return the result. Both indices must be smaller
//! than the number of register elements; identical indices return `src` unchanged.
390 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
391 [[nodiscard]] inline auto swap(T_RegisterType src) noexcept -> T_RegisterType;
392 
393 
//! Declaration of `swap_lanes`: swap the lanes of an AVX register and return the result.
405 template <FloatAVXRegister T_RegisterType>
406 [[nodiscard]] inline auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType;
407 
408 
//! Declaration of `swap_lanes_if`: return a new register with or without swapped lanes depending on the value of
//! the boolean template parameter `t_swap_lanes`.
422 template <bool t_swap_lanes, FloatAVXRegister T_RegisterType>
423 [[nodiscard]] inline auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType;
424 
425 
427 } // namespace mjolnir::x86
428 
429 
430 // === DEFINITIONS ====================================================================================================
431 
435 
436 namespace mjolnir::x86
437 {
438 // --------------------------------------------------------------------------------------------------------------------
439 
440 template <UST t_shift, FloatVectorRegister T_RegisterType>
441 [[nodiscard]] inline auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept
442  -> T_RegisterType
443 {
444  static_assert(t_shift <= num_lane_elements<T_RegisterType>, "t_shift must be in the range [0, num_lane_elements].");
445 
446 
447  if constexpr (t_shift == 0)
448  return rhs;
449  else if constexpr (t_shift == num_lane_elements<T_RegisterType>)
450  return lhs;
451  else
452  {
453  constexpr UST element_shift = t_shift * sizeof(ElementType<T_RegisterType>);
454 
455  if constexpr (num_lanes<T_RegisterType> == 1)
456  return mm_cast_if<T_RegisterType>(_mm_alignr_epi8(mm_cast_fi(lhs), mm_cast_fi(rhs), element_shift));
457  else
458  return mm_cast_if<T_RegisterType>(_mm256_alignr_epi8(mm_cast_fi(lhs), mm_cast_fi(rhs), element_shift));
459  }
460 }
461 
462 
463 // --------------------------------------------------------------------------------------------------------------------
464 
465 template <UST... t_args, FloatVectorRegister T_RegisterType>
466 [[nodiscard]] inline auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
467 {
468  static_assert(sizeof...(t_args) == num_elements<T_RegisterType>,
469  "Number of template parameters must be equal to the number of register elements.");
470  static_assert(pack_all_less<t_args...>(2), "All template values must be in the range [0, 1]");
471 
472  return mm_blend<bit_construct<UST, t_args...>(true)>(src_0, src_1);
473 }
474 
475 
476 // --------------------------------------------------------------------------------------------------------------------
477 
478 template <UST t_index, FloatVectorRegister T_RegisterType>
479 [[nodiscard]] inline auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
480 {
481  static_assert(t_index < num_elements<T_RegisterType>, "`t_index` exceeds register size.");
482 
483  if constexpr (t_index == num_elements<T_RegisterType> - 1)
484  return src_0;
485  else
486  {
487  constexpr auto get_mask = [](UST index)
488  {
489  UST mask = 0;
490  for (UST i = index + 1; i < num_elements<T_RegisterType>; ++i)
491  set_bit(mask, i);
492  return mask;
493  };
494 
495  return mm_blend<get_mask(t_index)>(src_0, src_1);
496  }
497 }
498 
499 
500 // --------------------------------------------------------------------------------------------------------------------
501 
502 template <UST t_index, FloatVectorRegister T_RegisterType>
503 [[nodiscard]] inline auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
504 {
505  static_assert(t_index < num_elements<T_RegisterType>, "`t_index` exceeds register size.");
506 
507  return mm_blend<(UST(1) << t_index)>(src_0, src_1);
508 }
509 
510 
511 // --------------------------------------------------------------------------------------------------------------------
512 
513 template <UST t_index, FloatVectorRegister T_RegisterType>
514 [[nodiscard]] inline auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
515 {
516  static_assert(t_index < num_elements<T_RegisterType>, "`t_index` exceeds register size.");
517 
518  if constexpr (t_index == 0)
519  return src_0;
520  else
521  {
522  constexpr auto get_mask = [](UST index)
523  {
524  UST mask = 0;
525  for (UST i = 0; i < index; ++i)
526  set_bit(mask, i);
527  return mask;
528  };
529 
530  return mm_blend<get_mask(t_index)>(src_0, src_1);
531  }
532 }
533 
534 
535 // --------------------------------------------------------------------------------------------------------------------
536 
537 template <UST t_index_first, UST t_index_last, FloatVectorRegister T_RegisterType>
538 [[nodiscard]] inline auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
539 {
540  static_assert(t_index_first <= t_index_last, "`t_index_first` is larger than `t_index_last`.");
541  static_assert(t_index_last < num_elements<T_RegisterType>, "`t_index_last` exceeds register size.");
542 
543  if constexpr (t_index_first == 0 && t_index_last == num_elements<T_RegisterType> - 1)
544  return src_1;
545  else
546  {
547  constexpr auto get_mask = [](UST idx_first, UST idx_last)
548  {
549  UST mask = 0;
550  for (UST i = idx_first; i <= idx_last; ++i)
551  set_bit(mask, i);
552  return mask;
553  };
554 
555  return mm_blend<get_mask(t_index_first, t_index_last)>(src_0, src_1);
556  }
557 }
558 
559 // --------------------------------------------------------------------------------------------------------------------
560 
561 template <UST t_index, FloatVectorRegister T_RegisterType>
562 [[nodiscard]] inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
563 {
564  static_assert(t_index < num_lane_elements<T_RegisterType>, "t_index exceeds lane size.");
565 
566  if constexpr (t_index == 0 && is_sse_register<T_RegisterType>)
567  return mm_broadcast(src);
568  else if constexpr (is_double_precision<T_RegisterType>)
569  return permute<t_index, t_index>(src);
570  else
571  return permute<t_index, t_index, t_index, t_index>(src);
572 }
573 
574 
575 // --------------------------------------------------------------------------------------------------------------------
576 
577 template <UST t_index_0, UST t_index_1, FloatAVXRegister T_RegisterType>
578 [[nodiscard]] inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
579 {
580  constexpr UST n_le = num_lane_elements<T_RegisterType>;
581 
582  static_assert(t_index_0 < n_le && t_index_1 < n_le, "Indices may not exceed lane size.");
583 
584  if constexpr (is_m256d<T_RegisterType>)
585  return permute<t_index_0, t_index_0, t_index_1, t_index_1>(src);
586  else
587  return permute<t_index_0, t_index_0, t_index_0, t_index_0, t_index_1, t_index_1, t_index_1, t_index_1>(src);
588 }
589 
590 
591 // --------------------------------------------------------------------------------------------------------------------
592 
594 template <UST t_index, FloatVectorRegister T_RegisterType>
595 [[nodiscard]] inline auto broadcast_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
596 {
597  static_assert(t_index < num_elements<T_RegisterType>, "Index exceeds register size.");
598 
599  if constexpr (t_index == 0)
600  return mm_broadcast(src);
601  else if constexpr (is_sse_register<T_RegisterType>)
602  return broadcast<t_index>(src);
603  else
604  {
605  constexpr UST idx_value = t_index % num_lane_elements<T_RegisterType>;
606  constexpr UST idx_lane = t_index / num_lane_elements<T_RegisterType>;
607 
608  return permute_lanes<idx_lane, idx_lane>(broadcast<idx_value>(src));
609  }
610 }
611 
612 
613 // --- internal functions of exchange ---------------------------------------------------------------------------------
614 
616 namespace internal
617 {
619 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
620 [[nodiscard]] inline auto exchange_same_lane(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
621 {
622  constexpr UST n_le = num_lane_elements<T_RegisterType>;
623 
624  T_RegisterType tmp_0 = reg_0;
625  T_RegisterType tmp_1 = reg_1;
626 
627  if constexpr (t_idx_0 != t_idx_1)
628  {
629  tmp_0 = broadcast<t_idx_0 % n_le>(tmp_0);
630  tmp_1 = broadcast<t_idx_1 % n_le>(tmp_1);
631  }
632 
633  reg_0 = blend_at<t_idx_0>(reg_0, tmp_1); // NOLINT(readability-misleading-indentation) - clang-tidy bug
634  reg_1 = blend_at<t_idx_1>(reg_1, tmp_0);
635 }
636 
637 
638 // --------------------------------------------------------
639 
640 
642 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
643 [[nodiscard]] inline auto exchange_different_lane(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
644 {
645  constexpr UST n_le = num_lane_elements<T_RegisterType>;
646  constexpr UST lane_idx_0 = t_idx_0 / n_le;
647  constexpr UST lane_idx_1 = t_idx_1 / n_le;
648 
649  constexpr UST select_reg_0 = (lane_idx_0 == 0) ? 1 : 0;
650  constexpr UST select_reg_1 = (lane_idx_1 == 0) ? 1 : 0;
651  constexpr U32 element_idx_0 = (lane_idx_0 == 0) ? t_idx_1 % n_le : t_idx_0 % n_le;
652  constexpr U32 element_idx_1 = (lane_idx_0 == 0) ? t_idx_0 % n_le : t_idx_1 % n_le;
653 
654 
655  T_RegisterType tmp_0 = shuffle_lanes<select_reg_0, 1, select_reg_1, 0>(reg_0, reg_1);
656 
657  if constexpr (element_idx_0 != element_idx_1)
658  tmp_0 = broadcast<element_idx_0, element_idx_1>(tmp_0);
659 
660  reg_0 = blend_at<t_idx_0>(reg_0, tmp_0);
661  reg_1 = blend_at<t_idx_1>(reg_1, tmp_0);
662 }
663 
664 } // namespace internal
666 
667 
668 // --------------------------------------------------------------------------------------------------------------------
669 
670 template <UST t_index_0, UST t_index_1, FloatVectorRegister T_RegisterType>
671 inline void exchange(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
672 {
673  constexpr UST n_e = num_elements<T_RegisterType>;
674  constexpr UST n_le = num_lane_elements<T_RegisterType>;
675  constexpr UST lane_idx_0 = t_index_0 / n_le;
676  constexpr UST lane_idx_1 = t_index_1 / n_le;
677 
678  static_assert(t_index_0 < n_e && t_index_1 < n_e, "Indices exceed the register size.");
679 
680 
681  if constexpr (lane_idx_0 == lane_idx_1)
682  internal::exchange_same_lane<t_index_0, t_index_1>(reg_0, reg_1);
683  else
684  internal::exchange_different_lane<t_index_0, t_index_1>(reg_0, reg_1);
685 }
686 
687 
688 // --------------------------------------------------------------------------------------------------------------------
689 
690 template <UST t_index_src, UST t_index_dst, bool... t_set_zero>
691 inline auto insert(__m128 src, __m128 dst) noexcept -> __m128
692 {
693  constexpr UST n_e = num_elements<__m128>;
694  static_assert(t_index_src < n_e && t_index_dst < n_e, "Indices exceed the register size.");
695 
696  constexpr UST set_zero_mask = bit_construct<UST, t_set_zero...>(true);
697  constexpr UST selection_mask = bit_construct_from_ints<2, UST, t_index_src, t_index_dst>();
698  constexpr UST mask = bit_construct_from_ints<4, UST, selection_mask, set_zero_mask>();
699 
700  return _mm_insert_ps(dst, src, mask);
701 }
702 
703 
704 // --------------------------------------------------------------------------------------------------------------------
705 
706 template <UST... t_indices, FloatVectorRegister T_RegisterType>
707 [[nodiscard]] inline auto permute(T_RegisterType src) noexcept -> T_RegisterType
708 {
709  constexpr UST n_e = num_elements<T_RegisterType>;
710  constexpr UST n_le = num_lane_elements<T_RegisterType>;
711 
712  static_assert(sizeof...(t_indices) == n_le || (is_avx_register<T_RegisterType> && sizeof...(t_indices) == n_e),
713  "Number of indices must be identical to the number of elements or the number of lane elements.");
714  static_assert(pack_all_less<t_indices...>(n_le),
715  "All index values must be in the range [0, number of lane elements]");
716 
717  if constexpr (is_m256d<T_RegisterType> && sizeof...(t_indices) == n_le)
718  return permute<t_indices..., t_indices...>(src);
719  else if constexpr (is_m256<T_RegisterType> && sizeof...(t_indices) == n_e)
720  return _mm256_permutevar_ps(src, _mm256_setr_epi32(t_indices...));
721  else
722  {
723  constexpr UST num_index_bits = num_lane_elements<T_RegisterType> / 2;
724  return mm_permute<bit_construct_from_ints<num_index_bits, U8, t_indices...>(true)>(src);
725  }
726 }
727 
728 
729 // --------------------------------------------------------------------------------------------------------------------
730 
731 template <UST... t_indices, FloatVectorRegister T_RegisterType>
732 [[nodiscard]] inline auto permute_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
733 {
734  constexpr UST n_e = num_elements<T_RegisterType>;
735 
736  static_assert(sizeof...(t_indices) == n_e, "Number of indices must be equal to the number of register elements.");
737  static_assert(pack_all_less<t_indices...>(n_e),
738  "All template values must be in the range [0, number of register elements]");
739 
740  if constexpr (num_lanes<T_RegisterType> == 1)
741  return permute<t_indices...>(src);
742  else if constexpr (is_m256d<T_RegisterType>)
743  {
744  constexpr UST mask = bit_construct_from_ints<2, UST, t_indices...>(true);
745  return _mm256_permute4x64_pd(src, mask);
746  }
747  else
748  {
749  const __m256i mask = _mm256_setr_epi32(t_indices...);
750  return _mm256_permutevar8x32_ps(src, mask);
751  }
752 }
753 
754 
755 // --------------------------------------------------------------------------------------------------------------------
756 
757 template <UST t_lane_0, UST t_lane_1, FloatAVXRegister T_RegisterType>
758 [[nodiscard]] inline auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType
759 {
760  return shuffle_lanes<0, t_lane_0, 0, t_lane_1>(src, src);
761 }
762 
763 
764 // --------------------------------------------------------------------------------------------------------------------
765 
766 template <UST... t_indices, FloatVectorRegister T_RegisterType>
767 [[nodiscard]] inline auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
768 {
769  constexpr UST n_e = num_elements<T_RegisterType>;
770  constexpr UST n_le = num_lane_elements<T_RegisterType>;
771 
772  static_assert(sizeof...(t_indices) == n_le || (is_m256d<T_RegisterType> && sizeof...(t_indices) == n_e),
773  "Number of indices must be identical to the number of lane elements (or elements for __m256d).");
774  static_assert(pack_all_less<t_indices...>(n_le),
775  "All index values must be in the range [0, number of lane elements]");
776 
777  constexpr auto get_mask = []() -> UST
778  {
779  if constexpr (is_single_precision<T_RegisterType>)
780  return bit_construct_from_ints<2, UST, t_indices...>(true);
781  else if constexpr (sizeof...(t_indices) == num_elements<T_RegisterType>)
782  return bit_construct<UST, t_indices...>(true);
783  else
784  return bit_construct<UST, t_indices..., t_indices...>(true);
785  };
786 
787  return mm_shuffle<get_mask()>(src_0, src_1);
788 }
789 
790 
791 // --------------------------------------------------------------------------------------------------------------------
792 
793 template <UST t_src_0, UST t_lane_0, UST t_src_1, UST t_lane_1, FloatAVXRegister T_RegisterType>
794 [[nodiscard]] inline auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
795 {
796  static_assert(pack_all_less<t_src_0, t_lane_0, t_src_1, t_lane_1>(2),
797  "All template values must be in the range [0, 1]");
798 
799  constexpr UST sel_0 = bit_construct<UST, t_src_0, t_lane_0>();
800  constexpr UST sel_1 = bit_construct<UST, t_src_1, t_lane_1>();
801  constexpr UST mask = (sel_1 << 4U) | sel_0;
802 
803  return mm_permute2f128<mask>(src_0, src_1);
804 }
805 
806 
807 // --- internal functions of swap -------------------------------------------------------------------------------------
808 
810 namespace internal
811 {
813 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
814 [[nodiscard]] inline auto swap_same_lane(T_RegisterType src) noexcept -> T_RegisterType
815 {
816  constexpr UST n_e = num_elements<T_RegisterType>;
817 
818  constexpr auto get_permute_index_array = []() constexpr
819  {
820  constexpr UST n_le = num_lane_elements<T_RegisterType>;
821  std::array<UST, n_e> a = {{0}};
822 
823  for (UST i = 0; i < n_e; ++i)
824  {
825  if (t_idx_0 == i)
826  a[i] = t_idx_1 % n_le;
827  else if (t_idx_1 == i)
828  a[i] = t_idx_0 % n_le;
829  else
830  a[i] = i % n_le;
831  }
832  return a;
833  };
834  constexpr auto p = get_permute_index_array();
835 
836 
837  if constexpr (n_e == 2)
838  return permute<p[0], p[1]>(src);
839  else if constexpr (n_e == 4)
840  return permute<p[0], p[1], p[2], p[3]>(src);
841  else
842  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
843  return permute<p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]>(src);
844 }
845 
846 
847 // --------------------------------------------------------
848 
850 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
851 [[nodiscard]] inline auto swap_different_lane(T_RegisterType src) noexcept -> T_RegisterType
852 {
853  constexpr UST n_e = num_elements<T_RegisterType>;
854  constexpr UST n_le = num_lane_elements<T_RegisterType>;
855 
856  constexpr U32 idx_lane_0 = (t_idx_0 < t_idx_1) ? t_idx_0 : t_idx_1;
857  constexpr U32 idx_lane_1 = ((t_idx_0 > t_idx_1) ? t_idx_0 : t_idx_1) % n_le;
858 
859  auto get_blend_index_array = []() constexpr->std::array<UST, n_e>
860  {
861  std::array<UST, n_e> a = {{0}};
862  for (UST i = 0; i < n_le; ++i)
863  {
864  a[i] = (idx_lane_0 == i) ? 1 : 0;
865  a[i + n_le] = (idx_lane_1 == i) ? 1 : 0;
866  }
867  return a;
868  };
869  constexpr auto b = get_blend_index_array();
870 
871 
872  T_RegisterType bc = broadcast<idx_lane_0, idx_lane_1>(src);
873  T_RegisterType tmp = swap_lanes(bc);
874  if constexpr (n_e == 4)
875  return blend<b[0], b[1], b[2], b[3]>(src, tmp);
876  else
877  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
878  return blend<b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]>(src, tmp);
879 }
880 
881 } // namespace internal
883 
884 
885 // --------------------------------------------------------------------------------------------------------------------
886 
887 template <UST t_idx_0, UST t_idx_1, FloatVectorRegister T_RegisterType>
888 [[nodiscard]] inline auto swap(T_RegisterType src) noexcept -> T_RegisterType
889 {
890  constexpr UST n_e = num_elements<T_RegisterType>;
891  static_assert(t_idx_0 < n_e && t_idx_1 < n_e, "Indices must be smaller than the number of register elements.");
892 
893  if constexpr (t_idx_0 == t_idx_1)
894  return src;
895  else
896  {
897  constexpr UST n_le = num_lane_elements<T_RegisterType>;
898  constexpr UST lane_0 = t_idx_0 / n_le;
899  constexpr UST lane_1 = t_idx_1 / n_le;
900 
901  if constexpr (lane_0 == lane_1)
902  return internal::swap_same_lane<t_idx_0, t_idx_1>(src);
903  else
904  return internal::swap_different_lane<t_idx_0, t_idx_1>(src);
905  }
906 }
907 
908 // --------------------------------------------------------------------------------------------------------------------
909 
910 template <FloatAVXRegister T_RegisterType>
911 [[nodiscard]] inline auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
912 {
913  return permute_lanes<1, 0>(src);
914 }
915 
916 
917 // --------------------------------------------------------------------------------------------------------------------
918 
919 template <bool t_swap_lanes, FloatAVXRegister T_RegisterType>
920 [[nodiscard]] inline auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType
921 {
922  if constexpr (t_swap_lanes)
923  return swap_lanes(src);
924  else
925  return src;
926 }
927 
928 // --------------------------------------------------------------------------------------------------------------------
929 
930 } // namespace mjolnir::x86
Contains utility functions for bit related operations like setting and reading specific bits.
Defines the fundamental data types.
std::uint32_t U32
32 bit unsigned integer type
Definition: fundamental_types.h:27
std::size_t UST
Unsigned integer type that is returned by sizeof operations.
Definition: fundamental_types.h:29
std::uint8_t U8
8 bit unsigned integer type
Definition: fundamental_types.h:25
consteval auto bit_construct([[maybe_unused]] bool left_is_low=false) noexcept -> T_Type
Construct an unsigned integer by setting its individual bits.
Definition: bit_operations.h:459
constexpr void set_bit(T_Type &integer, UST index) noexcept
Set a single specific bit of an unsigned integer.
Definition: bit_operations.h:708
consteval auto bit_construct_from_ints(bool left_is_low=false) noexcept -> T_Type
Construct an unsigned integer from the bit patterns of multiple integer values.
Definition: bit_operations.h:499
auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Create a new AVX register by combining arbitrary lanes from two source registers.
Definition: permutation.h:794
auto mm_broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcasts the lowest floating point element across lanes to all elements of the returned register.
Definition: intrinsics.h:608
auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements inside the specified index range are taken from src_1 and the rest from...
Definition: permutation.h:538
auto permute_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements of a vector register across lanes using indices and return the result in a new r...
Definition: permutation.h:732
auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements with a lower index than t_index are copied from src_1 and the rest from ...
Definition: permutation.h:514
typename std::conditional_t< is_any_of< T_RegisterType, __m128d, __m256d >(), F64, F32 > ElementType
The element type of an x86 vector register that is based on floating-point types.
Definition: definitions.h:212
auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Return a register with the first half of the lane elements selected from src_0 and the second half fr...
Definition: permutation.h:767
auto broadcast_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element selected by t_index across lane boundaries.
Definition: permutation.h:595
auto mm_blend(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Blend elements from a and b using a control mask and return the resulting vector register.
Definition: intrinsics.h:592
auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements with a higher index than t_index are copied from src_1 and the rest from...
Definition: permutation.h:479
auto permute(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements of a vector register within lanes using indices and return the result in a new r...
Definition: permutation.h:707
concept FloatVectorRegister
Concept for a x86 vector register that has floating-point elements.
Definition: definitions.h:39
auto swap(T_RegisterType src) noexcept -> T_RegisterType
Swap two elements of a register and return the result.
Definition: permutation.h:888
auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType
Create a new AVX register by an arbitrary combination of the source registers lanes.
Definition: permutation.h:758
auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element per lane selected by t_index_0 and t_index_1.
Definition: permutation.h:578
auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType
Return a new register with or without swapped lanes depending on the value of the boolean template pa...
Definition: permutation.h:920
auto mm_cast_fi(T_RegisterTypeIn src) noexcept
Bit cast a floating-point vector register to an equally sized integer vector register.
Definition: intrinsics.h:627
void exchange(T_RegisterType &reg_0, T_RegisterType &reg_1) noexcept
Exchange two elements selected by indices between two registers.
Definition: permutation.h:671
auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept -> T_RegisterType
Concatenate two floating-point registers, shift the result right by t_shift elements,...
Definition: permutation.h:441
auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a new register where the element with index t_index is taken from src_1 and the rest from src_0
Definition: permutation.h:503
auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Blend elements from src_0 and src_1 into a new register.
Definition: permutation.h:466
auto mm_permute(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements in src using the control mask t_mask and return the resulting vector register.
Definition: intrinsics.h:849
auto insert(__m128 src, __m128 dst) noexcept -> __m128
Insert a single element from src into dst and return the result in a new __m128 register.
Definition: permutation.h:691
auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
Swap the lanes of an AVX register and return the result.
Definition: permutation.h:911
Contains generalized/template versions of the x86 intrinsics.
Contains utility functions for parameter packs.
Contains x86 vectorization specific constants, concepts and definitions.