17 namespace mjolnir::x86
38 template <UST t_shift, FloatVectorRegister T_RegisterType>
39 [[nodiscard]]
inline auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept
62 [[nodiscard]]
inline auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
81 template <UST t_index, FloatVectorRegister T_RegisterType>
82 [[nodiscard]]
inline auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
100 template <UST t_index, FloatVectorRegister T_RegisterType>
101 [[nodiscard]]
inline auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
120 template <UST t_index, FloatVectorRegister T_RegisterType>
121 [[nodiscard]]
inline auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
141 template <UST t_index_first, UST t_index_last, FloatVectorRegister T_RegisterType>
142 [[nodiscard]]
inline auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
161 template <UST t_index, FloatVectorRegister T_RegisterType>
162 [[nodiscard]]
inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType;
183 template <UST t_index_0, UST t_index_1, FloatAVXRegister T_RegisterType>
184 [[nodiscard]]
inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType;
203 template <UST t_index, FloatVectorRegister T_RegisterType>
225 template <UST t_index_0, UST t_index_1, FloatVectorRegister T_RegisterType>
226 inline void exchange(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept;
253 template <
UST t_index_src,
UST t_index_dst,
bool... t_set_zero>
254 inline auto insert(__m128 src, __m128 dst) noexcept -> __m128;
277 [[nodiscard]]
inline auto permute(T_RegisterType src) noexcept -> T_RegisterType;
316 template <UST t_lane_0, UST t_lane_1, FloatAVXRegister T_RegisterType>
317 [[nodiscard]]
inline auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType;
342 [[nodiscard]]
inline auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
368 template <UST t_src_0, UST t_lane_0, UST t_src_1, UST t_lane_1, FloatAVXRegister T_RegisterType>
369 [[nodiscard]]
inline auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType;
390 template <UST t_
idx_0, UST t_
idx_1, FloatVectorRegister T_RegisterType>
391 [[nodiscard]]
inline auto swap(T_RegisterType src) noexcept -> T_RegisterType;
405 template <FloatAVXRegister T_RegisterType>
406 [[nodiscard]]
inline auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType;
422 template <
bool t_swap_lanes, FloatAVXRegister T_RegisterType>
423 [[nodiscard]]
inline auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType;
436 namespace mjolnir::x86
440 template <UST t_shift, FloatVectorRegister T_RegisterType>
441 [[nodiscard]]
inline auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept
444 static_assert(t_shift <= num_lane_elements<T_RegisterType>,
"t_shift must be in the range [0, num_lane_elements].");
447 if constexpr (t_shift == 0)
449 else if constexpr (t_shift == num_lane_elements<T_RegisterType>)
455 if constexpr (num_lanes<T_RegisterType> == 1)
456 return mm_cast_if<T_RegisterType>(_mm_alignr_epi8(
mm_cast_fi(lhs),
mm_cast_fi(rhs), element_shift));
458 return mm_cast_if<T_RegisterType>(_mm256_alignr_epi8(
mm_cast_fi(lhs),
mm_cast_fi(rhs), element_shift));
466 [[nodiscard]]
inline auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
468 static_assert(
sizeof...(t_args) == num_elements<T_RegisterType>,
469 "Number of template parameters must be equal to the number of register elements.");
470 static_assert(pack_all_less<t_args...>(2),
"All template values must be in the range [0, 1]");
478 template <UST t_index, FloatVectorRegister T_RegisterType>
479 [[nodiscard]]
inline auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
481 static_assert(t_index < num_elements<T_RegisterType>,
"`t_index` exceeds register size.");
483 if constexpr (t_index == num_elements<T_RegisterType> - 1)
487 constexpr
auto get_mask = [](
UST index)
490 for (
UST i = index + 1; i < num_elements<T_RegisterType>; ++i)
495 return mm_blend<get_mask(t_index)>(src_0, src_1);
502 template <UST t_index, FloatVectorRegister T_RegisterType>
503 [[nodiscard]]
inline auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
505 static_assert(t_index < num_elements<T_RegisterType>,
"`t_index` exceeds register size.");
507 return mm_blend<(UST(1) << t_index)>(src_0, src_1);
513 template <UST t_index, FloatVectorRegister T_RegisterType>
514 [[nodiscard]]
inline auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
516 static_assert(t_index < num_elements<T_RegisterType>,
"`t_index` exceeds register size.");
518 if constexpr (t_index == 0)
522 constexpr
auto get_mask = [](
UST index)
525 for (
UST i = 0; i < index; ++i)
530 return mm_blend<get_mask(t_index)>(src_0, src_1);
537 template <UST t_index_first, UST t_index_last, FloatVectorRegister T_RegisterType>
538 [[nodiscard]]
inline auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
540 static_assert(t_index_first <= t_index_last,
"`t_index_first` is larger than `t_index_last`.");
541 static_assert(t_index_last < num_elements<T_RegisterType>,
"`t_index_last` exceeds register size.");
543 if constexpr (t_index_first == 0 && t_index_last == num_elements<T_RegisterType> - 1)
547 constexpr
auto get_mask = [](
UST idx_first,
UST idx_last)
550 for (
UST i = idx_first; i <= idx_last; ++i)
555 return mm_blend<get_mask(t_index_first, t_index_last)>(src_0, src_1);
561 template <UST t_index, FloatVectorRegister T_RegisterType>
562 [[nodiscard]]
inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
564 static_assert(t_index < num_lane_elements<T_RegisterType>,
"t_index exceeds lane size.");
566 if constexpr (t_index == 0 && is_sse_register<T_RegisterType>)
568 else if constexpr (is_double_precision<T_RegisterType>)
569 return permute<t_index, t_index>(src);
571 return permute<t_index, t_index, t_index, t_index>(src);
577 template <UST t_index_0, UST t_index_1, FloatAVXRegister T_RegisterType>
578 [[nodiscard]]
inline auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
580 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
582 static_assert(t_index_0 < n_le && t_index_1 < n_le,
"Indices may not exceed lane size.");
584 if constexpr (is_m256d<T_RegisterType>)
585 return permute<t_index_0, t_index_0, t_index_1, t_index_1>(src);
587 return permute<t_index_0, t_index_0, t_index_0, t_index_0, t_index_1, t_index_1, t_index_1, t_index_1>(src);
594 template <UST t_index, FloatVectorRegister T_RegisterType>
597 static_assert(t_index < num_elements<T_RegisterType>,
"Index exceeds register size.");
599 if constexpr (t_index == 0)
601 else if constexpr (is_sse_register<T_RegisterType>)
602 return broadcast<t_index>(src);
605 constexpr
UST idx_value = t_index % num_lane_elements<T_RegisterType>;
606 constexpr
UST idx_lane = t_index / num_lane_elements<T_RegisterType>;
608 return permute_lanes<idx_lane, idx_lane>(broadcast<idx_value>(src));
619 template <UST t_
idx_0, UST t_
idx_1, FloatVectorRegister T_RegisterType>
620 [[nodiscard]]
inline auto exchange_same_lane(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
622 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
624 T_RegisterType tmp_0 = reg_0;
625 T_RegisterType tmp_1 = reg_1;
627 if constexpr (t_idx_0 != t_idx_1)
629 tmp_0 = broadcast<t_idx_0 % n_le>(tmp_0);
630 tmp_1 = broadcast<t_idx_1 % n_le>(tmp_1);
633 reg_0 = blend_at<t_idx_0>(reg_0, tmp_1);
634 reg_1 = blend_at<t_idx_1>(reg_1, tmp_0);
642 template <UST t_
idx_0, UST t_
idx_1, FloatVectorRegister T_RegisterType>
643 [[nodiscard]]
inline auto exchange_different_lane(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
645 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
646 constexpr
UST lane_idx_0 = t_idx_0 / n_le;
647 constexpr
UST lane_idx_1 = t_idx_1 / n_le;
649 constexpr
UST select_reg_0 = (lane_idx_0 == 0) ? 1 : 0;
650 constexpr
UST select_reg_1 = (lane_idx_1 == 0) ? 1 : 0;
651 constexpr
U32 element_idx_0 = (lane_idx_0 == 0) ? t_idx_1 % n_le : t_idx_0 % n_le;
652 constexpr
U32 element_idx_1 = (lane_idx_0 == 0) ? t_idx_0 % n_le : t_idx_1 % n_le;
655 T_RegisterType tmp_0 = shuffle_lanes<select_reg_0, 1, select_reg_1, 0>(reg_0, reg_1);
657 if constexpr (element_idx_0 != element_idx_1)
658 tmp_0 = broadcast<element_idx_0, element_idx_1>(tmp_0);
660 reg_0 = blend_at<t_idx_0>(reg_0, tmp_0);
661 reg_1 = blend_at<t_idx_1>(reg_1, tmp_0);
670 template <UST t_index_0, UST t_index_1, FloatVectorRegister T_RegisterType>
671 inline void exchange(T_RegisterType& reg_0, T_RegisterType& reg_1) noexcept
673 constexpr
UST n_e = num_elements<T_RegisterType>;
674 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
675 constexpr
UST lane_idx_0 = t_index_0 / n_le;
676 constexpr
UST lane_idx_1 = t_index_1 / n_le;
678 static_assert(t_index_0 < n_e && t_index_1 < n_e,
"Indices exceed the register size.");
681 if constexpr (lane_idx_0 == lane_idx_1)
682 internal::exchange_same_lane<t_index_0, t_index_1>(reg_0, reg_1);
684 internal::exchange_different_lane<t_index_0, t_index_1>(reg_0, reg_1);
690 template <
UST t_index_src,
UST t_index_dst,
bool... t_set_zero>
691 inline auto insert(__m128 src, __m128 dst) noexcept -> __m128
693 constexpr
UST n_e = num_elements<__m128>;
694 static_assert(t_index_src < n_e && t_index_dst < n_e,
"Indices exceed the register size.");
697 constexpr
UST selection_mask = bit_construct_from_ints<2, UST, t_index_src, t_index_dst>();
698 constexpr
UST mask = bit_construct_from_ints<4, UST, selection_mask, set_zero_mask>();
700 return _mm_insert_ps(dst, src, mask);
707 [[nodiscard]]
inline auto permute(T_RegisterType src) noexcept -> T_RegisterType
709 constexpr
UST n_e = num_elements<T_RegisterType>;
710 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
712 static_assert(
sizeof...(t_indices) == n_le || (is_avx_register<T_RegisterType> &&
sizeof...(t_indices) == n_e),
713 "Number of indices must be identical to the number of elements or the number of lane elements.");
714 static_assert(pack_all_less<t_indices...>(n_le),
715 "All index values must be in the range [0, number of lane elements]");
717 if constexpr (is_m256d<T_RegisterType> &&
sizeof...(t_indices) == n_le)
718 return permute<t_indices..., t_indices...>(src);
719 else if constexpr (is_m256<T_RegisterType> &&
sizeof...(t_indices) == n_e)
720 return _mm256_permutevar_ps(src, _mm256_setr_epi32(t_indices...));
723 constexpr
UST num_index_bits = num_lane_elements<T_RegisterType> / 2;
734 constexpr
UST n_e = num_elements<T_RegisterType>;
736 static_assert(
sizeof...(t_indices) == n_e,
"Number of indices must be equal to the number of register elements.");
737 static_assert(pack_all_less<t_indices...>(n_e),
738 "All template values must be in the range [0, number of register elements]");
740 if constexpr (num_lanes<T_RegisterType> == 1)
741 return permute<t_indices...>(src);
742 else if constexpr (is_m256d<T_RegisterType>)
745 return _mm256_permute4x64_pd(src, mask);
749 const __m256i mask = _mm256_setr_epi32(t_indices...);
750 return _mm256_permutevar8x32_ps(src, mask);
757 template <UST t_lane_0, UST t_lane_1, FloatAVXRegister T_RegisterType>
758 [[nodiscard]]
inline auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType
760 return shuffle_lanes<0, t_lane_0, 0, t_lane_1>(src, src);
767 [[nodiscard]]
inline auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
769 constexpr
UST n_e = num_elements<T_RegisterType>;
770 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
772 static_assert(
sizeof...(t_indices) == n_le || (is_m256d<T_RegisterType> &&
sizeof...(t_indices) == n_e),
773 "Number of indices must be identical to the number of lane elements (or elements for __m256d).");
774 static_assert(pack_all_less<t_indices...>(n_le),
775 "All index values must be in the range [0, number of lane elements]");
777 constexpr
auto get_mask = []() ->
UST
779 if constexpr (is_single_precision<T_RegisterType>)
781 else if constexpr (
sizeof...(t_indices) == num_elements<T_RegisterType>)
787 return mm_shuffle<get_mask()>(src_0, src_1);
793 template <UST t_src_0, UST t_lane_0, UST t_src_1, UST t_lane_1, FloatAVXRegister T_RegisterType>
794 [[nodiscard]]
inline auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
796 static_assert(pack_all_less<t_src_0, t_lane_0, t_src_1, t_lane_1>(2),
797 "All template values must be in the range [0, 1]");
799 constexpr
UST sel_0 = bit_construct<UST, t_src_0, t_lane_0>();
800 constexpr
UST sel_1 = bit_construct<UST, t_src_1, t_lane_1>();
801 constexpr
UST mask = (sel_1 << 4U) | sel_0;
803 return mm_permute2f128<mask>(src_0, src_1);
813 template <UST t_
idx_0, UST t_
idx_1, FloatVectorRegister T_RegisterType>
814 [[nodiscard]]
inline auto swap_same_lane(T_RegisterType src) noexcept -> T_RegisterType
816 constexpr
UST n_e = num_elements<T_RegisterType>;
818 constexpr
auto get_permute_index_array = []() constexpr
820 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
821 std::array<UST, n_e> a = {{0}};
823 for (UST i = 0; i < n_e; ++i)
826 a[i] = t_idx_1 % n_le;
827 else if (t_idx_1 == i)
828 a[i] = t_idx_0 % n_le;
834 constexpr
auto p = get_permute_index_array();
837 if constexpr (n_e == 2)
838 return
permute<p[0], p[1]>(src);
839 else if constexpr (n_e == 4)
840 return
permute<p[0], p[1], p[2], p[3]>(src);
843 return
permute<p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]>(src);
851 [[nodiscard]] inline auto swap_different_lane(T_RegisterType src) noexcept -> T_RegisterType
853 constexpr
UST n_e = num_elements<T_RegisterType>;
854 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
856 constexpr
U32 idx_lane_0 = (t_idx_0 < t_idx_1) ? t_idx_0 : t_idx_1;
857 constexpr
U32 idx_lane_1 = ((t_idx_0 > t_idx_1) ? t_idx_0 : t_idx_1) % n_le;
859 auto get_blend_index_array = []() constexpr->std::array<UST, n_e>
861 std::array<UST, n_e> a = {{0}};
862 for (UST i = 0; i < n_le; ++i)
864 a[i] = (idx_lane_0 == i) ? 1 : 0;
865 a[i + n_le] = (idx_lane_1 == i) ? 1 : 0;
869 constexpr
auto b = get_blend_index_array();
872 T_RegisterType bc = broadcast<idx_lane_0, idx_lane_1>(src);
874 if constexpr (n_e == 4)
875 return
blend<b[0], b[1], b[2], b[3]>(src, tmp);
878 return
blend<b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]>(src, tmp);
888 [[nodiscard]] inline auto
swap(T_RegisterType src) noexcept -> T_RegisterType
890 constexpr
UST n_e = num_elements<T_RegisterType>;
891 static_assert(t_idx_0 < n_e && t_idx_1 < n_e,
"Indices must be smaller than the number of register elements.");
893 if constexpr (t_idx_0 == t_idx_1)
897 constexpr
UST n_le = num_lane_elements<T_RegisterType>;
898 constexpr
UST lane_0 = t_idx_0 / n_le;
899 constexpr
UST lane_1 = t_idx_1 / n_le;
901 if constexpr (lane_0 == lane_1)
902 return internal::swap_same_lane<t_idx_0, t_idx_1>(src);
904 return internal::swap_different_lane<t_idx_0, t_idx_1>(src);
910 template <FloatAVXRegister T_RegisterType>
911 [[nodiscard]]
inline auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
913 return permute_lanes<1, 0>(src);
919 template <
bool t_swap_lanes, FloatAVXRegister T_RegisterType>
920 [[nodiscard]]
inline auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType
922 if constexpr (t_swap_lanes)
Contains utility functions for bit related operations like setting and reading specific bits.
Defines the fundamental data types.
std::uint32_t U32
32 bit unsigned integer type
Definition: fundamental_types.h:27
std::size_t UST
Unsigned integer type that is returned by sizeof operations.
Definition: fundamental_types.h:29
std::uint8_t U8
8 bit unsigned integer type
Definition: fundamental_types.h:25
consteval auto bit_construct([[maybe_unused]] bool left_is_low=false) noexcept -> T_Type
Construct an unsigned integer by setting its individual bits.
Definition: bit_operations.h:459
constexpr void set_bit(T_Type &integer, UST index) noexcept
Set a single specific bit of an unsigned integer.
Definition: bit_operations.h:708
consteval auto bit_construct_from_ints(bool left_is_low=false) noexcept -> T_Type
Construct an unsigned integer from the bit patterns of multiple integer values.
Definition: bit_operations.h:499
auto shuffle_lanes(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Create a new AVX register by combining arbitrary lanes from two source registers.
Definition: permutation.h:794
auto mm_broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcasts the lowest floating point element across lanes to all elements of the returned register.
Definition: intrinsics.h:608
auto blend_from_to(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements inside the specified index range are taken from src_1 and the rest from...
Definition: permutation.h:538
auto permute_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements of a vector register across lanes using indices and return the result in a new r...
Definition: permutation.h:732
auto blend_below(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements with a lower index than t_index are copied from src_1 and the rest from ...
Definition: permutation.h:514
typename std::conditional_t< is_any_of< T_RegisterType, __m128d, __m256d >(), F64, F32 > ElementType
The element type of an x86 vector register that is based on floating-point types.
Definition: definitions.h:212
auto shuffle(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Return a register with the first half of the lane elements selected from src_0 and the second half fr...
Definition: permutation.h:767
auto broadcast_across_lanes(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element selected by t_index across lane boundaries.
Definition: permutation.h:595
auto mm_blend(T_RegisterType a, T_RegisterType b) noexcept -> T_RegisterType
Blend elements from a and b using a control mask and return the resulting vector register.
Definition: intrinsics.h:592
auto blend_above(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a register where elements with a higher index than t_index are copied from src_1 and the rest from...
Definition: permutation.h:479
auto permute(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements of a vector register within lanes using indices and return the result in a new r...
Definition: permutation.h:707
concept FloatVectorRegister
Concept for a x86 vector register that has floating-point elements.
Definition: definitions.h:39
auto swap(T_RegisterType src) noexcept -> T_RegisterType
Swap two elements of a register and return the result.
Definition: permutation.h:888
auto permute_lanes(T_RegisterType src) noexcept -> T_RegisterType
Create a new AVX register by an arbitrary combination of the source registers lanes.
Definition: permutation.h:758
auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element per lane selected by t_index_0 and t_index_1.
Definition: permutation.h:578
auto swap_lanes_if(T_RegisterType src) noexcept -> T_RegisterType
Return a new register with or without swapped lanes depending on the value of the boolean template pa...
Definition: permutation.h:920
auto mm_cast_fi(T_RegisterTypeIn src) noexcept
Bit cast a floating-point vector register to an equally sized integer vector register.
Definition: intrinsics.h:627
void exchange(T_RegisterType &reg_0, T_RegisterType &reg_1) noexcept
Exchange two elements selected by indices between two registers.
Definition: permutation.h:671
auto align_right([[maybe_unused]] T_RegisterType lhs, [[maybe_unused]] T_RegisterType rhs) noexcept -> T_RegisterType
Concatenate two floating-point registers, shift the result right by t_shift elements,...
Definition: permutation.h:441
auto blend_at(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Get a new register where the element with index t_index is taken from src_1 and the rest from src_0
Definition: permutation.h:503
auto blend(T_RegisterType src_0, T_RegisterType src_1) noexcept -> T_RegisterType
Blend elements from src_0 and src_1 into a new register.
Definition: permutation.h:466
auto mm_permute(T_RegisterType src) noexcept -> T_RegisterType
Shuffle the elements in src using the control mask t_mask and return the resulting vector register.
Definition: intrinsics.h:849
auto insert(__m128 src, __m128 dst) noexcept -> __m128
Insert a single element from src into dst and return the result in a new __m128 register.
Definition: permutation.h:691
auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
Swap the lanes of an AVX register and return the result.
Definition: permutation.h:911
Contains generalized/template versions of the x86 intrinsics.
Contains utility functions for parameter packs.
Contains x86 vectorization specific constants, concepts and definitions.