en/latest/element__summation_8h_source.html

 #pragma once


 #include "mjolnir/core/x86/definitions.h"


 namespace mjolnir::x86

 {


 /// @brief Calculate the sum of all elements of `src`, broadcast it into a new register and return the result.
 ///
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose elements are summed.
 /// @return A register of the same type as `src` that holds the element sum.
 template <FloatVectorRegister T_RegisterType>

 [[nodiscard]] inline auto broadcast_element_sum(T_RegisterType src) noexcept -> T_RegisterType;


 /// @brief Return the sum of all elements from `src`.
 ///
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose elements are summed.
 /// @return The element sum as a scalar of type `ElementType<T_RegisterType>`.
 template <FloatVectorRegister T_RegisterType>

 [[nodiscard]] inline auto element_sum(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;


 /// @brief Return the sum of the first `t_num_elements` elements from `src`.
 ///
 /// @tparam t_num_elements Number of leading elements to sum. Checked at compile time: must be larger than 0 and
 ///                        not exceed `num_elements<T_RegisterType>`.
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, FloatVectorRegister T_RegisterType>

 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;


 // --- internal declarations -------------------------------------------------------------------------------------------


 namespace internal

 {

 /// @brief Overload of `element_sum_first_n` for register types that satisfy `DoublePrecisionVectorRegister`.
 ///
 /// The public `element_sum_first_n` validates `t_num_elements` with static assertions before forwarding to this
 /// function; this overload contains no range check of its own.
 ///
 /// @tparam t_num_elements Number of leading elements to sum.
 /// @tparam T_RegisterType Register type; must satisfy the `DoublePrecisionVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, DoublePrecisionVectorRegister T_RegisterType>

 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;


 /// @brief Overload of `element_sum_first_n` for register types that satisfy `SinglePrecisionVectorRegister`.
 ///
 /// The public `element_sum_first_n` validates `t_num_elements` with static assertions before forwarding to this
 /// function; this overload contains no range check of its own.
 ///
 /// @tparam t_num_elements Number of leading elements to sum.
 /// @tparam T_RegisterType Register type; must satisfy the `SinglePrecisionVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, SinglePrecisionVectorRegister T_RegisterType>

 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>;

 } // namespace internal


 } // namespace mjolnir::x86


 // === DEFINITIONS ====================================================================================================


 #include "mjolnir/core/x86/intrinsics.h"

 #include "mjolnir/core/x86/permutation.h"

 #include "mjolnir/core/x86/x86.h"


 namespace mjolnir::x86

 {

 // --------------------------------------------------------------------------------------------------------------------


 /// @brief Calculate the sum of all elements of `src`, broadcast it into a new register and return the result.
 ///
 /// The reduction repeatedly adds the running register to a rearranged copy of itself, so no element has to be
 /// extracted before the final result is available.
 ///
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose elements are summed.
 /// @return A register of the same type as `src` that holds the element sum.
 template <FloatVectorRegister T_RegisterType>
 [[nodiscard]] inline auto broadcast_element_sum(T_RegisterType src) noexcept -> T_RegisterType
 {
     T_RegisterType total = src;

     // Fold inside the lanes first. NOTE(review): this presumes that `permute` rearranges the elements of each lane
     // independently - confirm against permutation.h.
     if constexpr (is_single_precision<T_RegisterType>)
     {
         // Two folding steps for single precision: neighbouring elements first, then the two resulting pairs.
         total = mm_add(total, permute<1, 0, 3, 2>(total));
         total = mm_add(total, permute<2, 3, 0, 1>(total));
     }
     else
     {
         // One folding step is enough for double precision.
         total = mm_add(total, permute<1, 0>(total));
     }

     // AVX registers consist of two lanes, so the per-lane results still have to be combined.
     if constexpr (is_avx_register<T_RegisterType>)
         total = mm_add(total, swap_lanes(total));

     return total;
 }


 // --------------------------------------------------------------------------------------------------------------------


 /// @brief Return the sum of all elements from `src`.
 ///
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose elements are summed.
 /// @return The element sum as a scalar of type `ElementType<T_RegisterType>`.
 template <FloatVectorRegister T_RegisterType>
 [[nodiscard]] inline auto element_sum(T_RegisterType src) noexcept -> ElementType<T_RegisterType>
 {
     // Reduce the whole register first, then read the first element of the reduced register as the scalar result.
     return mm_cvt_float(broadcast_element_sum(src));
 }


 // --------------------------------------------------------------------------------------------------------------------


 /// @brief Return the sum of the first `t_num_elements` elements from `src`.
 ///
 /// Validates `t_num_elements` at compile time and forwards to the precision-specific overload in `internal`.
 ///
 /// @tparam t_num_elements Number of leading elements to sum; must be in the range [1, num_elements<T_RegisterType>].
 /// @tparam T_RegisterType Register type; must satisfy the `FloatVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, FloatVectorRegister T_RegisterType>
 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>
 {
     static_assert(t_num_elements > 0, "`t_num_elements` must be larger than 0.");
     static_assert(t_num_elements <= num_elements<T_RegisterType>,
                   "`t_num_elements` must be less or equal to the number of register elements.");

     return internal::element_sum_first_n<t_num_elements, T_RegisterType>(src);
 }


 // --- internal definitions -------------------------------------------------------------------------------------------


 namespace internal

 {

 // --------------------------------------------------------------------------------------------------------------------


 /// @brief Double precision implementation of `element_sum_first_n`.
 ///
 /// `t_num_elements` is not validated here; the public `element_sum_first_n` does that before forwarding.
 ///
 /// @tparam t_num_elements Number of leading elements to sum.
 /// @tparam T_RegisterType Register type; must satisfy the `DoublePrecisionVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, DoublePrecisionVectorRegister T_RegisterType>
 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>
 {
     if constexpr (t_num_elements == 1)
     {
         // Nothing to add: the first element is the result.
         return mm_cvt_float(src);
     }
     else if constexpr (t_num_elements == num_elements<T_RegisterType>)
     {
         // The whole register is requested, so the full reduction can be reused.
         return element_sum(src);
     }
     else
     {
         // After this addition the first element is expected to hold the sum of the first two elements.
         // NOTE(review): presumes `permute<1, 0>` swaps the two elements of a lane - confirm against permutation.h.
         const T_RegisterType pair_sum = mm_add(src, permute<1, 0>(src));

         if constexpr (t_num_elements == 3)
         {
             // Swapping the lanes moves the first element of the other lane to the front, which adds the third
             // element of `src` to the pair sum.
             return mm_cvt_float(mm_add(pair_sum, swap_lanes(src)));
         }
         else
         {
             return mm_cvt_float(pair_sum);
         }
     }
 }


 // --------------------------------------------------------------------------------------------------------------------


 /// @brief Single precision implementation of `element_sum_first_n`.
 ///
 /// `t_num_elements` is not validated here; the public `element_sum_first_n` does that before forwarding. Each
 /// supported element count gets its own compile-time branch, and the scalar result is always read from the first
 /// element of the reduced register.
 ///
 /// @tparam t_num_elements Number of leading elements to sum.
 /// @tparam T_RegisterType Register type; must satisfy the `SinglePrecisionVectorRegister` concept.
 /// @param src Register whose leading elements are summed.
 /// @return The partial sum as a scalar of type `ElementType<T_RegisterType>`.
 template <UST t_num_elements, SinglePrecisionVectorRegister T_RegisterType>
 [[nodiscard]] inline auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType<T_RegisterType>
 {
     if constexpr (t_num_elements == 1)
     {
         // Nothing to add: the first element is the result.
         return mm_cvt_float(src);
     }
     else if constexpr (t_num_elements == num_elements<T_RegisterType>)
     {
         // The whole register is requested, so the full reduction can be reused.
         return element_sum(src);
     }
     else if constexpr (t_num_elements == 7) // NOLINT(readability-magic-numbers)
     {
         // Blend a zero register into `src` and run the full reduction on the result.
         // NOTE(review): presumably `blend_at<7>` takes only the element with index 7 from the zero register, which
         // removes the last element from the sum - confirm against permutation.h.
         return element_sum(blend_at<t_num_elements>(src, mm_setzero<__m256>()));
     }
     else
     {
         // Sums of neighbouring elements; the first element is expected to hold src[0] + src[1].
         // NOTE(review): presumes `permute` rearranges the elements of each lane independently - confirm against
         // permutation.h.
         const T_RegisterType pair_sums = mm_add(src, permute<1, 0, 3, 2>(src));

         if constexpr (t_num_elements == 3)
         {
             // Add the third element, which `broadcast<2>` places into the first position.
             return mm_cvt_float(mm_add(pair_sums, broadcast<2>(src)));
         }
         else if constexpr (t_num_elements >= 4 && t_num_elements <= 6) // NOLINT(readability-magic-numbers)
         {
             // Combine both pair sums of a lane: the first element now covers the first four elements of `src`.
             const T_RegisterType quad_sums = mm_add(pair_sums, permute<2, 3, 0, 1>(pair_sums));

             if constexpr (t_num_elements == 4)
             {
                 return mm_cvt_float(quad_sums);
             }
             else if constexpr (t_num_elements == 5) // NOLINT(readability-magic-numbers)
             {
                 // The lane swap brings the fifth element of `src` to the front.
                 return mm_cvt_float(mm_add(quad_sums, swap_lanes(src)));
             }
             else
             {
                 // The lane swap brings the pair sum of the fifth and sixth element to the front.
                 return mm_cvt_float(mm_add(quad_sums, swap_lanes(pair_sums)));
             }
         }
         else
         {
             // Remaining case (two elements): the pair sum already is the result.
             return mm_cvt_float(pair_sums);
         }
     }
 }

 } // namespace internal


 } // namespace mjolnir::x86

mjolnir::UST
std::size_t UST
Unsigned integer type that is returned by sizeof operations.
Definition: fundamental_types.h:29

mjolnir::x86::mm_cvt_float
auto mm_cvt_float(T_RegisterType src) -> ElementType< T_RegisterType >
Return the first element of src.
Definition: intrinsics.h:739

mjolnir::x86::SinglePrecisionVectorRegister
concept SinglePrecisionVectorRegister
Concept for an x86 vector register that has single precision elements.
Definition: definitions.h:66

mjolnir::x86::ElementType
typename std::conditional_t< is_any_of< T_RegisterType, __m128d, __m256d >(), F64, F32 > ElementType
The element type of an x86 vector register that is based on floating-point types.
Definition: definitions.h:212

mjolnir::x86::broadcast
auto broadcast(T_RegisterType src) noexcept -> T_RegisterType
Broadcast a register element per lane selected by t_index.
Definition: permutation.h:562

mjolnir::x86::broadcast_element_sum
auto broadcast_element_sum(T_RegisterType src) noexcept -> T_RegisterType
Calculate the sum of all elements of src, broadcast it into a new register and return the result.
Definition: element_summation.h:97

mjolnir::x86::element_sum_first_n
auto element_sum_first_n(T_RegisterType src) noexcept -> ElementType< T_RegisterType >
Return the sum of the first t_num_elements elements from src.
Definition: element_summation.h:134

mjolnir::x86::num_elements
constexpr UST num_elements
Number of register elements.
Definition: definitions.h:257

mjolnir::x86::mm_add
auto mm_add(T_RegisterType lhs, T_RegisterType rhs) noexcept -> T_RegisterType
Perform an element-wise addition of lhs and rhs and return the result.
Definition: intrinsics.h:544

mjolnir::x86::element_sum
auto element_sum(T_RegisterType src) noexcept -> ElementType< T_RegisterType >
Return the sum of all elements from src.
Definition: element_summation.h:124

mjolnir::x86::swap_lanes
auto swap_lanes(T_RegisterType src) noexcept -> T_RegisterType
Swap the lanes of an AVX register and return the result.
Definition: permutation.h:911

intrinsics.h
Contains generalized/template versions of the x86 intrinsics.

permutation.h
Contains functions to permute and blend the elements of vector registers.

definitions.h
Contains x86 vectorization specific constants, concepts and definitions.

x86.h
This header includes the correct x86 header depending on the operating system.