12 namespace mjolnir::x86
29 template <FloatVectorRegister T_RegisterType>
/// Return the sum of all elements from `src`.
///
/// @tparam T_RegisterType  Register type; must satisfy the `FloatVectorRegister` concept.
/// @param  src             Register whose elements are summed.
/// @return The sum, as a value of type `ElementType<T_RegisterType>`.
44 template <FloatVectorRegister T_RegisterType>
45 [[nodiscard]]
inline auto element_sum(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;
/// Return the sum of the first `t_num_elements` elements from `src`.
///
/// The definition statically asserts that `t_num_elements` is larger than 0 and not
/// larger than the number of elements of `T_RegisterType`.
///
/// @tparam t_num_elements  Number of leading elements to include in the sum.
/// @tparam T_RegisterType  Register type; must satisfy the `FloatVectorRegister` concept.
/// @param  src             Register that provides the elements.
/// @return The sum, as a value of type `ElementType<T_RegisterType>`.
61 template <UST t_num_elements, FloatVectorRegister T_RegisterType>
62 [[nodiscard]]
inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;
/// Overload of `element_sum_first_n` constrained to `DoublePrecisionVectorRegister` types.
///
/// NOTE(review): presumably this is the helper that the generic `element_sum_first_n`
/// reaches through its `internal::`-qualified call; the enclosing namespace is not
/// visible in this listing - confirm.
///
/// @tparam t_num_elements  Number of leading elements to include in the sum.
/// @tparam T_RegisterType  Register type with double precision elements.
/// @param  src             Register that provides the elements.
/// @return A value of type `ElementType<T_RegisterType>`.
70 template <UST t_num_elements, DoublePrecisionVectorRegister T_RegisterType>
71 [[nodiscard]]
inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;
/// Overload of `element_sum_first_n` constrained to `SinglePrecisionVectorRegister` types.
///
/// NOTE(review): presumably this is the helper that the generic `element_sum_first_n`
/// reaches through its `internal::`-qualified call; the enclosing namespace is not
/// visible in this listing - confirm.
///
/// @tparam t_num_elements  Number of leading elements to include in the sum.
/// @tparam T_RegisterType  Register type with single precision elements.
/// @param  src             Register that provides the elements.
/// @return A value of type `ElementType<T_RegisterType>`.
74 template <UST t_num_elements, SinglePrecisionVectorRegister T_RegisterType>
75 [[nodiscard]]
inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;
91 namespace mjolnir::x86
96 template <FloatVectorRegister T_RegisterType>
99 if constexpr (is_single_precision<T_RegisterType>)
101 T_RegisterType sum =
mm_add(src, permute<1, 0, 3, 2>(src));
102 sum =
mm_add(sum, permute<2, 3, 0, 1>(sum));
104 if constexpr (is_avx_register<T_RegisterType>)
111 T_RegisterType sum =
mm_add(src, permute<1, 0>(src));
113 if constexpr (is_avx_register<T_RegisterType>)
123 template <FloatVectorRegister T_RegisterType>
133 template <UST t_num_elements, FloatVectorRegister T_RegisterType>
136 constexpr
UST n_e = num_elements<T_RegisterType>;
138 static_assert(t_num_elements > 0,
"`t_num_elements` must be larger than 0.");
139 static_assert(t_num_elements <= n_e,
"`t_num_elements` must be less or equal to the number of register elements.");
141 return internal::element_sum_first_n<t_num_elements, T_RegisterType>(src);
152 template <UST t_num_elements, DoublePrecisionVectorRegister T_RegisterType>
153 [[nodiscard]]
inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>
155 if constexpr (t_num_elements == 1)
157 else if constexpr (t_num_elements ==
num_elements<T_RegisterType>)
161 T_RegisterType sum =
mm_add(src, permute<1, 0>(src));
163 if constexpr (t_num_elements == 3)
176 if constexpr (t_num_elements == 1)
178 else if constexpr (t_num_elements ==
num_elements<T_RegisterType>)
180 else if constexpr (t_num_elements == 7)
182 auto zero = mm_setzero<__m256>();
183 return element_sum(blend_at<t_num_elements>(src, zero));
187 T_RegisterType sum =
mm_add(src, permute<1, 0, 3, 2>(src));
189 if constexpr (t_num_elements == 3)
192 if constexpr (t_num_elements == 4 || t_num_elements == 5)
194 sum =
mm_add(sum, permute<2, 3, 0, 1>(sum));
196 if constexpr (t_num_elements == 5)
200 if constexpr (t_num_elements == 6)
203 sum =
mm_add(sum, permute<2, 3, 0, 1>(sum));
std::size_t UST
Unsigned integer type that is returned by sizeof operations.
Definition: fundamental_types.h:29
auto mm_cvt_float(T_RegisterType src) -> ElementType< T_RegisterType >
Return the first element of src.
Definition: intrinsics.h:739
concept SinglePrecisionVectorRegister
Concept for an x86 vector register that has single precision elements.
Definition: definitions.h:66
typename std::conditional_t< is_any_of< T_RegisterType, __m128d, __m256d >(), F64, F32 > ElementType
The element type of an x86 vector register that is based on floating-point types.
Definition: definitions.h:212
auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element per lane selected by t_index.
Definition: permutation.h:562
auto broadcast_element_sum(T_RegisterType src) noexcept -> T_RegisterType
Calculate the sum of all elements of src, broadcast it into a new register and return the result.
Definition: element_summation.h:97
auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType< T_RegisterType >
Return the sum of the first t_num_elements elements from src.
Definition: element_summation.h:134
constexpr UST num_elements
Number of register elements.
Definition: definitions.h:257
auto mm_add(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Perform an element-wise addition of lhs and rhs and return the result.
Definition: intrinsics.h:544
auto element_sum(T_RegisterType src) noexcept -> ElementType< T_RegisterType >
Return the sum of all elements from src.
Definition: element_summation.h:124
auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
Swap the lanes of an AVX register and return the result.
Definition: permutation.h:911
Contains generalized/template versions of the x86 intrinsics.
Contains functions to permute and blend the elements of vector registers.
Contains x86 vectorization specific constants, concepts and definitions.
This header includes the correct x86 header depending on the operating system.