eigen/test/packetmath.cpp

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "packetmath_test_shared.h"
#include "random_without_cast_overflow.h"
template <typename T>
inline T REF_ADD(const T& a, const T& b) {
return a + b;
}
template <typename T>
inline T REF_SUB(const T& a, const T& b) {
return a - b;
}
template <typename T>
inline T REF_MUL(const T& a, const T& b) {
return a * b;
}
template <typename Scalar, typename EnableIf = void>
struct madd_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
return a * b + c;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
return a * b - c;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
return c - a * b;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
return Scalar(0) - (a * b + c);
}
};
template <typename Scalar>
struct madd_impl<Scalar,
std::enable_if_t<Eigen::internal::is_scalar<Scalar>::value && Eigen::NumTraits<Scalar>::IsSigned>> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar madd(const Scalar& a, const Scalar& b, const Scalar& c) {
return numext::madd(a, b, c);
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar msub(const Scalar& a, const Scalar& b, const Scalar& c) {
return numext::madd(a, b, Scalar(-c));
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
return numext::madd(Scalar(-a), b, c);
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar nmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
return -Scalar(numext::madd(a, b, c));
}
};
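// The four ops above mirror the standard FMA variants: madd = a*b + c,
// msub = a*b - c, nmadd = -(a*b) + c, nmsub = -(a*b) - c. The signed-scalar
// specialization routes everything through numext::madd so that the reference
// value matches a fused (single-rounding) result on targets that provide one.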
template <typename T>
inline T REF_MADD(const T& a, const T& b, const T& c) {
return madd_impl<T>::madd(a, b, c);
}
template <typename T>
inline T REF_MSUB(const T& a, const T& b, const T& c) {
return madd_impl<T>::msub(a, b, c);
}
template <typename T>
inline T REF_NMADD(const T& a, const T& b, const T& c) {
return madd_impl<T>::nmadd(a, b, c);
}
template <typename T>
inline T REF_NMSUB(const T& a, const T& b, const T& c) {
return madd_impl<T>::nmsub(a, b, c);
}
template <typename T>
inline T REF_DIV(const T& a, const T& b) {
return a / b;
}
template <typename T>
inline T REF_RECIPROCAL(const T& a) {
return T(1) / a;
}
template <typename T>
inline T REF_ABS_DIFF(const T& a, const T& b) {
return a > b ? a - b : b - a;
}
// Specializations for bool.
template <>
inline bool REF_ADD(const bool& a, const bool& b) {
return a || b;
}
template <>
inline bool REF_SUB(const bool& a, const bool& b) {
return a ^ b;
}
template <>
inline bool REF_MUL(const bool& a, const bool& b) {
return a && b;
}
template <>
inline bool REF_MADD(const bool& a, const bool& b, const bool& c) {
return (a && b) || c;
}
template <>
inline bool REF_DIV(const bool& a, const bool& b) {
return a && b;
}
template <>
inline bool REF_RECIPROCAL(const bool& a) {
return a;
}
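// The bool specializations above form the usual Boolean semiring: addition is
// OR, multiplication (and division) is AND, subtraction is XOR, and the only
// invertible element is true, so the reciprocal is the identity. In
// particular, REF_MADD(a, b, c) reduces to (a && b) || c.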
template <typename T>
inline T REF_FREXP(const T& x, T& exp) {
int iexp = 0;
EIGEN_USING_STD(frexp)
const T out = static_cast<T>(frexp(x, &iexp));
exp = static_cast<T>(iexp);
// The exponent value is unspecified if the input is inf or NaN, but MSVC
// seems to set it to 1. We need to set it back to zero for consistency.
if (!(numext::isfinite)(x)) {
exp = T(0);
}
return out;
}
template <typename T>
inline T REF_LDEXP(const T& x, const T& exp) {
EIGEN_USING_STD(ldexp)
return static_cast<T>(ldexp(x, static_cast<int>(exp)));
}
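// Worked example (illustrative): frexp decomposes x into m * 2^e with
// m in [0.5, 1), and ldexp inverts it. For double x = 8.0:
//   double e = 0;
//   double m = REF_FREXP(8.0, e);  // m == 0.5, e == 4.0
//   double y = REF_LDEXP(m, e);    // y == 8.0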
// Provides a convenient function that takes the absolute value of each component of a complex number, to
// prevent catastrophic cancellation in randomly generated complex numbers.
template <typename T, bool IsComplex = NumTraits<T>::IsComplex>
struct abs_helper_impl {
static T run(T x) { return numext::abs(x); }
};
template <typename T>
struct abs_helper_impl<T, true> {
static T run(T x) {
T res = x;
numext::real_ref(res) = numext::abs(numext::real(res));
numext::imag_ref(res) = numext::abs(numext::imag(res));
return res;
}
};
template <typename T>
T abs_helper(T x) {
return abs_helper_impl<T>::run(x);
}
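// Example (illustrative): for std::complex<float> z(-3.f, 4.f), abs_helper(z)
// yields (3.f, 4.f), the component-wise absolute values, whereas a plain
// numext::abs(z) would collapse it to the modulus 5.f.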
// Uses pcast to cast from one array to another.
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
struct pcast_array;
template <typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
struct pcast_array<SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
static void cast(const SrcScalar* src, size_t size, TgtScalar* dst) {
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
size_t i;
for (i = 0; i < size && i + SrcPacketSize <= size; i += TgtPacketSize) {
internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(internal::ploadu<SrcPacket>(src + i)));
}
// Leftovers that cannot be loaded into a packet.
for (; i < size; ++i) {
dst[i] = static_cast<TgtScalar>(src[i]);
}
}
};
template <typename SrcPacket, typename TgtPacket>
struct pcast_array<SrcPacket, TgtPacket, 2, 1> {
static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
typename internal::unpacket_traits<TgtPacket>::type* dst) {
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
for (size_t i = 0; i < size; i += TgtPacketSize) {
SrcPacket a = internal::ploadu<SrcPacket>(src + i);
SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b));
}
}
};
template <typename SrcPacket, typename TgtPacket>
struct pcast_array<SrcPacket, TgtPacket, 4, 1> {
static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
typename internal::unpacket_traits<TgtPacket>::type* dst) {
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
for (size_t i = 0; i < size; i += TgtPacketSize) {
SrcPacket a = internal::ploadu<SrcPacket>(src + i);
SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d));
}
}
};
template <typename SrcPacket, typename TgtPacket>
struct pcast_array<SrcPacket, TgtPacket, 8, 1> {
static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
typename internal::unpacket_traits<TgtPacket>::type* dst) {
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
for (size_t i = 0; i < size; i += TgtPacketSize) {
SrcPacket a = internal::ploadu<SrcPacket>(src + i);
SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
SrcPacket e = internal::ploadu<SrcPacket>(src + i + 4 * SrcPacketSize);
SrcPacket f = internal::ploadu<SrcPacket>(src + i + 5 * SrcPacketSize);
SrcPacket g = internal::ploadu<SrcPacket>(src + i + 6 * SrcPacketSize);
SrcPacket h = internal::ploadu<SrcPacket>(src + i + 7 * SrcPacketSize);
internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d, e, f, g, h));
}
}
};
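// All of the N:1 specializations above share one pattern: N consecutive
// source packets are loaded and merged by a single pcast call into one target
// packet. For example (assuming an SSE build, where type_casting_traits gives
// double->float a SrcCoeffRatio of 2), two Packet2d loads feed one Packet4f
// store per iteration.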
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio, bool CanCast = false>
struct test_cast_helper;
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, false> {
static void run() {}
};
template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, true> {
static void run() {
typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
static const int BlockSize = SrcPacketSize * SrcCoeffRatio;
eigen_assert(BlockSize == TgtPacketSize * TgtCoeffRatio && "Packet sizes and cast ratios are mismatched.");
static const int DataSize = 10 * BlockSize;
EIGEN_ALIGN_MAX SrcScalar data1[DataSize];
EIGEN_ALIGN_MAX TgtScalar data2[DataSize];
EIGEN_ALIGN_MAX TgtScalar ref[DataSize];
// Construct an array of scalars that will not overflow when casting.
for (int i = 0; i < DataSize; ++i) {
data1[i] = internal::random_without_cast_overflow<SrcScalar, TgtScalar>::value();
}
for (int i = 0; i < DataSize; ++i) {
ref[i] = static_cast<TgtScalar>(data1[i]);
}
pcast_array<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio>::cast(data1, DataSize, data2);
VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
// Test that pcast<SrcScalar, TgtScalar> generates the same result.
for (int i = 0; i < DataSize; ++i) {
data2[i] = internal::pcast<SrcScalar, TgtScalar>(data1[i]);
}
VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
}
};
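// test_cast only exercises the vectorized path when both packets are
// vectorizable, a vectorized pcast is available, and the block sizes agree,
// i.e. SrcPacketSize * SrcCoeffRatio == TgtPacketSize * TgtCoeffRatio (both
// sides count the same number of scalars per block); otherwise the empty
// test_cast_helper specialization above turns the test into a no-op.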
template <typename SrcPacket, typename TgtPacket>
struct test_cast {
static void run() {
typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
typedef typename internal::type_casting_traits<SrcScalar, TgtScalar> TypeCastingTraits;
static const int SrcCoeffRatio = TypeCastingTraits::SrcCoeffRatio;
static const int TgtCoeffRatio = TypeCastingTraits::TgtCoeffRatio;
static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
static const bool HasCast =
internal::unpacket_traits<SrcPacket>::vectorizable && internal::unpacket_traits<TgtPacket>::vectorizable &&
TypeCastingTraits::VectorizedCast && (SrcPacketSize * SrcCoeffRatio == TgtPacketSize * TgtCoeffRatio);
test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, HasCast>::run();
}
};
template <typename SrcPacket, typename TgtScalar,
typename TgtPacket = typename internal::packet_traits<TgtScalar>::type,
bool Vectorized = internal::packet_traits<TgtScalar>::Vectorizable,
bool HasHalf = !internal::is_same<typename internal::unpacket_traits<TgtPacket>::half, TgtPacket>::value>
struct test_cast_runner;
template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, false> {
static void run() { test_cast<SrcPacket, TgtPacket>::run(); }
};
template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, true> {
static void run() {
test_cast<SrcPacket, TgtPacket>::run();
test_cast_runner<SrcPacket, TgtScalar, typename internal::unpacket_traits<TgtPacket>::half>::run();
}
};
template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, false, false> {
static void run() {}
};
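// The test_cast_runner specializations above walk the packet hierarchy: for a
// vectorizable target they test the full-width packet, then recurse into its
// half-width type (for instance Packet8f down to Packet4f on an AVX build)
// until the half type is the packet itself, at which point HasHalf is false
// and the recursion ends.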
template <typename Scalar, typename Packet, typename EnableIf = void>
struct packetmath_pcast_ops_runner {
static void run() {
test_cast_runner<Packet, float>::run();
test_cast_runner<Packet, double>::run();
test_cast_runner<Packet, int8_t>::run();
test_cast_runner<Packet, uint8_t>::run();
test_cast_runner<Packet, int16_t>::run();
test_cast_runner<Packet, uint16_t>::run();
test_cast_runner<Packet, int32_t>::run();
test_cast_runner<Packet, uint32_t>::run();
test_cast_runner<Packet, int64_t>::run();
test_cast_runner<Packet, uint64_t>::run();
test_cast_runner<Packet, bool>::run();
test_cast_runner<Packet, std::complex<float>>::run();
test_cast_runner<Packet, std::complex<double>>::run();
test_cast_runner<Packet, half>::run();
test_cast_runner<Packet, bfloat16>::run();
}
};
// Only some types support cast from std::complex<>.
template <typename Scalar, typename Packet>
struct packetmath_pcast_ops_runner<Scalar, Packet, std::enable_if_t<NumTraits<Scalar>::IsComplex>> {
static void run() {
test_cast_runner<Packet, std::complex<float>>::run();
test_cast_runner<Packet, std::complex<double>>::run();
test_cast_runner<Packet, half>::run();
test_cast_runner<Packet, bfloat16>::run();
}
};
template <typename Scalar, typename Packet>
void packetmath_boolean_mask_ops() {
using RealScalar = typename NumTraits<Scalar>::Real;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE1_MASK(internal::ptrue, internal::ptrue);
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(RealScalar(i));
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test (-0) == (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq);
}
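// pcmp_lt_or_nan(a, b) is the predicate !(a >= b): it must return the all-ones
// mask when a < b or when either operand is NaN, so the NaN rows below are
// expected to come back all-true regardless of the second operand.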
template <typename Scalar, typename Packet>
void packetmath_boolean_mask_ops_real() {
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
}
template <typename Scalar, typename Packet, typename EnableIf = void>
struct packetmath_boolean_mask_ops_notcomplex_test {
static void run() {}
};
template <typename Scalar, typename Packet>
struct packetmath_boolean_mask_ops_notcomplex_test<
Scalar, Packet,
std::enable_if_t<internal::packet_traits<Scalar>::HasCmp && !internal::is_same<Scalar, bool>::value>> {
static void run() {
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test (-0) <=/< (0) for signed operations
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
// Test NaN
for (int i = 0; i < PacketSize; ++i) {
data1[i] = NumTraits<Scalar>::quiet_NaN();
data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
}
CHECK_CWISE2_MASK(internal::pcmp_le, internal::pcmp_le);
CHECK_CWISE2_MASK(internal::pcmp_lt, internal::pcmp_lt);
}
};
template <typename Scalar, typename Packet, typename EnableIf = void>
struct packetmath_minus_zero_add_test {
static void run() {}
};
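// For non-integer scalars, verify the sign of zero: IEEE-754 requires
// (-0.0) + (-0.0) == -0.0, so padd must preserve the sign bit rather than
// normalizing the sum to +0.0.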
template <typename Scalar, typename Packet>
struct packetmath_minus_zero_add_test<Scalar, Packet, std::enable_if_t<!NumTraits<Scalar>::IsInteger>> {
static void run() {
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = 2 * PacketSize;
EIGEN_ALIGN_MAX Scalar data1[size] = {};
EIGEN_ALIGN_MAX Scalar data2[size] = {};
EIGEN_ALIGN_MAX Scalar ref[size] = {};
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(-0.0);
data1[i + PacketSize] = Scalar(-0.0);
}
CHECK_CWISE2_IF(internal::packet_traits<Scalar>::HasAdd, REF_ADD, internal::padd);
}
};
// Ensure optimization barrier compiles and doesn't modify contents.
// Only applies to raw types, so will not work for std::complex, Eigen::half
// or Eigen::bfloat16. For those you would need to refer to an underlying
// storage element.
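// The barrier is expected to make the compiler treat its argument as read and
// clobbered (typically via an empty inline-asm constraint), preventing
// constant folding across it; the test only asserts that the value survives
// unchanged.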
template <typename Packet, typename EnableIf = void>
struct eigen_optimization_barrier_test {
static void run() {}
};
template <typename Packet>
struct eigen_optimization_barrier_test<
Packet, std::enable_if_t<!NumTraits<Packet>::IsComplex && !internal::is_same<Packet, Eigen::half>::value &&
!internal::is_same<Packet, Eigen::bfloat16>::value>> {
static void run() {
typedef typename internal::unpacket_traits<Packet>::type Scalar;
Scalar s = internal::random<Scalar>();
Packet barrier = internal::pset1<Packet>(s);
EIGEN_OPTIMIZATION_BARRIER(barrier);
eigen_assert(s == internal::pfirst(barrier) && "EIGEN_OPTIMIZATION_BARRIER");
}
};
template <typename Scalar, typename Packet, bool HasNegate = internal::packet_traits<Scalar>::HasNegate>
struct negate_test_impl {
static void run_negate(Scalar* data1, Scalar* data2, Scalar* ref, int PacketSize) {
CHECK_CWISE1_IF(HasNegate, test::negate, internal::pnegate);
}
static void run_nmsub(Scalar* data1, Scalar* data2, Scalar* ref, int PacketSize) {
CHECK_CWISE3_IF(HasNegate, REF_NMSUB, internal::pnmsub);
}
};
template <typename Scalar, typename Packet>
struct negate_test_impl<Scalar, Packet, false> {
static void run_negate(Scalar*, Scalar*, Scalar*, int) {}
static void run_nmsub(Scalar*, Scalar*, Scalar*, int) {}
};
template <typename Scalar, typename Packet>
void negate_test(Scalar* data1, Scalar* data2, Scalar* ref, int size) {
negate_test_impl<Scalar, Packet>::run_negate(data1, data2, ref, size);
}
template <typename Scalar, typename Packet>
void nmsub_test(Scalar* data1, Scalar* data2, Scalar* ref, int size) {
negate_test_impl<Scalar, Packet>::run_nmsub(data1, data2, ref, size);
}
template <typename Scalar, typename Packet>
void packetmath() {
typedef internal::packet_traits<Scalar> PacketTraits;
const int PacketSize = internal::unpacket_traits<Packet>::size;
typedef typename NumTraits<Scalar>::Real RealScalar;
if (g_first_pass)
std::cerr << "=== Testing packet of type '" << typeid(Packet).name() << "' and scalar type '"
<< typeid(Scalar).name() << "' and size '" << PacketSize << "' ===\n";
const int max_size = PacketSize > 4 ? PacketSize : 4;
const int size = PacketSize * max_size;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar data3[size];
EIGEN_ALIGN_MAX Scalar ref[size];
RealScalar refvalue = RealScalar(0);
eigen_optimization_barrier_test<Packet>::run();
eigen_optimization_barrier_test<Scalar>::run();
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
data2[i] = internal::random<Scalar>();
refvalue = (std::max)(refvalue, numext::abs(data1[i]));
}
internal::pstore(data2, internal::pload<Packet>(data1));
VERIFY(test::areApprox(data1, data2, PacketSize) && "aligned load/store");
for (int offset = 0; offset < PacketSize; ++offset) {
internal::pstore(data2, internal::ploadu<Packet>(data1 + offset));
VERIFY(test::areApprox(data1 + offset, data2, PacketSize) && "internal::ploadu");
}
for (int offset = 0; offset < PacketSize; ++offset) {
internal::pstoreu(data2 + offset, internal::pload<Packet>(data1));
VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
}
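// Partial (length-limited) loads and stores: pload_partial/pstore_partial
// move only N lanes, optionally starting at lane offset M inside the packet,
// so the loops below sweep every valid (M, N) combination.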
for (int M = 0; M < PacketSize; ++M) {
for (int N = 0; N <= PacketSize; ++N) {
for (int j = 0; j < size; ++j) {
data1[j] = internal::random<Scalar>();
data2[j] = internal::random<Scalar>();
refvalue = (std::max)(refvalue, numext::abs(data1[j]));
}
if (M == 0) {
internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N), N);
VERIFY(test::areApprox(data1, data2, N) && "aligned loadN/storeN");
for (int offset = 0; offset < PacketSize; ++offset) {
internal::pstore_partial(data2, internal::ploadu_partial<Packet>(data1 + offset, N), N);
VERIFY(test::areApprox(data1 + offset, data2, N) && "internal::ploadu_partial");
}
for (int offset = 0; offset < PacketSize; ++offset) {
internal::pstoreu_partial(data2 + offset, internal::pload_partial<Packet>(data1, N), N);
VERIFY(test::areApprox(data1, data2 + offset, N) && "internal::pstoreu_partial");
}
}
if (N + M > PacketSize) continue; // Don't read or write past end of Packet
internal::pstore_partial(data2, internal::pload_partial<Packet>(data1, N, M), N, M);
VERIFY(test::areApprox(data1, data2, N) && "aligned offset loadN/storeN");
}
}
if (internal::unpacket_traits<Packet>::masked_load_available) {
test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
unsigned long long max_umask = (0x1ull << PacketSize);
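// Exhaustively enumerate all 2^PacketSize masks; bit k of umask selects
// whether lane k participates, and masked-off lanes must read back as zero.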
for (int offset = 0; offset < PacketSize; ++offset) {
for (unsigned long long umask = 0; umask < max_umask; ++umask) {
h.store(data2, h.load(data1 + offset, umask));
for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::ploadu masked");
}
}
}
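// Masked store: for every alignment offset and every possible lane mask,
// verify that only the lanes selected by the mask are written.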
if (internal::unpacket_traits<Packet>::masked_store_available) {
test::packet_helper<internal::unpacket_traits<Packet>::masked_store_available, Packet> h;
unsigned long long max_umask = (0x1ull << PacketSize);
for (int offset = 0; offset < PacketSize; ++offset) {
for (unsigned long long umask = 0; umask < max_umask; ++umask) {
internal::pstore(data2, internal::pset1<Packet>(Scalar(0)));
h.store(data2, h.loadu(data1 + offset), umask);
for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::pstoreu masked");
}
}
}
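// Any vectorizable scalar type must provide at least add, sub, and mul.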
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd);
CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub);
CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul);
CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
negate_test<Scalar, Packet>(data1, data2, ref, PacketSize);
CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal);
CHECK_CWISE1(numext::conj, internal::pconj);
CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign);
for (int offset = 0; offset < 3; ++offset) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[offset];
internal::pstore(data2, internal::pset1<Packet>(data1[offset]));
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pset1");
}
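// pbroadcast4: load four consecutive scalars and broadcast each one into its own packet.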
{
for (int i = 0; i < PacketSize * 4; ++i) ref[i] = data1[i / PacketSize];
Packet A0, A1, A2, A3;
internal::pbroadcast4<Packet>(data1, A0, A1, A2, A3);
internal::pstore(data2 + 0 * PacketSize, A0);
internal::pstore(data2 + 1 * PacketSize, A1);
internal::pstore(data2 + 2 * PacketSize, A2);
internal::pstore(data2 + 3 * PacketSize, A3);
VERIFY(test::areApprox(ref, data2, 4 * PacketSize) && "internal::pbroadcast4");
}
{
for (int i = 0; i < PacketSize * 2; ++i) ref[i] = data1[i / PacketSize];
Packet A0, A1;
internal::pbroadcast2<Packet>(data1, A0, A1);
internal::pstore(data2 + 0 * PacketSize, A0);
internal::pstore(data2 + 1 * PacketSize, A1);
VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && "internal::pbroadcast2");
}
VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload<Packet>(data1))) && "internal::pfirst");
if (PacketSize > 1) {
// apply different offsets to check that ploaddup is robust to unaligned inputs
for (int offset = 0; offset < 4; ++offset) {
for (int i = 0; i < PacketSize / 2; ++i) ref[2 * i + 0] = ref[2 * i + 1] = data1[offset + i];
internal::pstore(data2, internal::ploaddup<Packet>(data1 + offset));
VERIFY(test::areApprox(ref, data2, PacketSize) && "ploaddup");
}
}
if (PacketSize > 2) {
// apply different offsets to check that ploadquad is robust to unaligned inputs
for (int offset = 0; offset < 4; ++offset) {
for (int i = 0; i < PacketSize / 4; ++i)
ref[4 * i + 0] = ref[4 * i + 1] = ref[4 * i + 2] = ref[4 * i + 3] = data1[offset + i];
internal::pstore(data2, internal::ploadquad<Packet>(data1 + offset));
VERIFY(test::areApprox(ref, data2, PacketSize) && "ploadquad");
}
}
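// predux: horizontal sum across all lanes of a packet.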
ref[0] = Scalar(0);
for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i];
VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
if (!internal::is_same<Packet, typename internal::unpacket_traits<Packet>::half>::value) {
int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize;
for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0);
for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i];
internal::pstore(data2, internal::predux_half(internal::pload<Packet>(data1)));
VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half");
}
// Avoid overflows.
if (NumTraits<Scalar>::IsInteger && NumTraits<Scalar>::IsSigned &&
Eigen::internal::unpacket_traits<Packet>::size > 1) {
Scalar limit = static_cast<Scalar>(
static_cast<RealScalar>(std::pow(static_cast<double>(numext::real(NumTraits<Scalar>::highest())),
1.0 / static_cast<double>(Eigen::internal::unpacket_traits<Packet>::size))));
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>(Scalar(0) - limit, limit);
}
} else if (!NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex && !std::is_same<Scalar, bool>::value) {
// Prevent very small product results by adjusting range. Otherwise,
// we may end up with multiplying e.g. 32 Eigen::halfs with values < 1.
for (int i = 0; i < PacketSize; ++i) {
data1[i] = REF_MUL(internal::random<Scalar>(Scalar(0.5), Scalar(1)),
(internal::random<bool>() ? Scalar(-1) : Scalar(1)));
}
}
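// predux_mul: horizontal product across all lanes of a packet.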
ref[0] = Scalar(1);
for (int i = 0; i < PacketSize; ++i) ref[0] = REF_MUL(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
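// preverse: reverse the lane order of a packet.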
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[PacketSize - i - 1];
internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::preverse");
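// ptranspose: transpose a PacketSize x PacketSize block of packets in place.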
internal::PacketBlock<Packet> kernel;
for (int i = 0; i < PacketSize; ++i) {
kernel.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
}
ptranspose(kernel);
for (int i = 0; i < PacketSize; ++i) {
internal::pstore(data2, kernel.packet[i]);
for (int j = 0; j < PacketSize; ++j) {
VERIFY(test::isApproxAbs(data2[j], data1[i + j * PacketSize], refvalue) && "ptranspose");
}
}
// GeneralBlockPanelKernel also checks PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize>;
if (PacketSize > 4 && PacketSize % 4 == 0) {
internal::PacketBlock<Packet, PacketSize % 4 == 0 ? 4 : PacketSize> kernel2;
for (int i = 0; i < 4; ++i) {
kernel2.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
}
ptranspose(kernel2);
int data_counter = 0;
for (int i = 0; i < PacketSize; ++i) {
for (int j = 0; j < 4; ++j) {
data2[data_counter++] = data1[j * PacketSize + i];
}
}
for (int i = 0; i < 4; ++i) {
internal::pstore(data3, kernel2.packet[i]);
for (int j = 0; j < PacketSize; ++j) {
VERIFY(test::isApproxAbs(data3[j], data2[i * PacketSize + j], refvalue) && "ptranspose");
}
}
}
{
for (int i = 0; i < PacketSize; ++i) {
// "if" mask
// Note: it's UB to load 0xFF directly into a `bool`.
uint8_t v =
internal::random<bool>() ? (std::is_same<Scalar, bool>::value ? static_cast<uint8_t>(true) : 0xff) : 0;
// Avoid strict aliasing violation by using memset.
memset(static_cast<void*>(data1 + i), v, sizeof(Scalar));
// "then" packet
data1[i + PacketSize] = internal::random<Scalar>();
// "else" packet
data1[i + 2 * PacketSize] = internal::random<Scalar>();
}
CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
}
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>();
}
CHECK_CWISE1(internal::pzero, internal::pzero);
CHECK_CWISE2_IF(true, internal::por, internal::por);
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
packetmath_boolean_mask_ops<Scalar, Packet>();
packetmath_pcast_ops_runner<Scalar, Packet>::run();
packetmath_minus_zero_add_test<Scalar, Packet>::run();
CHECK_CWISE3_IF(true, REF_MADD, internal::pmadd);
if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
nmsub_test<Scalar, Packet>(data1, data2, ref, PacketSize);
}
// For pmsub, pnmadd, the values can cancel each other to become near zero,
// which can lead to very flaky tests. Here we ensure the signs are such that
// they do not cancel.
for (int i = 0; i < PacketSize; ++i) {
data1[i] = abs_helper(internal::random<Scalar>());
data1[i + PacketSize] = abs_helper(internal::random<Scalar>());
data1[i + 2 * PacketSize] = Scalar(0) - abs_helper(internal::random<Scalar>());
}
if (!std::is_same<Scalar, bool>::value && NumTraits<Scalar>::IsSigned) {
CHECK_CWISE3_IF(true, REF_MSUB, internal::pmsub);
CHECK_CWISE3_IF(true, REF_NMADD, internal::pnmadd);
}
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
CHECK_CWISE1_IF(PacketTraits::HasCbrt, numext::cbrt, internal::pcbrt);
}
// Note that this definition works for complex types as well.
// C++11 has std::log2 for real types, but not for complex types.
template <typename Scalar>
Scalar log2(Scalar x) {
return Scalar(EIGEN_LOG2E) * std::log(x);
}
template <typename Scalar, typename Packet>
void packetmath_real() {
typedef internal::packet_traits<Scalar> PacketTraits;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = PacketSize * 4;
EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4] = {};
EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4] = {};
EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4] = {};
// Negate with -0.
if (PacketTraits::HasNegate) {
test::packet_helper<PacketTraits::HasNegate, Packet> h;
data1[0] = Scalar{-0};
h.store(data2, internal::pnegate(h.load(data1)));
typedef typename internal::make_unsigned<typename internal::make_integer<Scalar>::type>::type Bits;
Bits bits = numext::bit_cast<Bits>(data2[0]);
VERIFY_IS_EQUAL(bits, static_cast<Bits>(Bits(1) << (sizeof(Scalar) * CHAR_BIT - 1)));
}
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
data2[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
}
if (internal::random<float>(0, 1) < 0.1f) data1[internal::random<int>(0, PacketSize)] = Scalar(0);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
CHECK_CWISE1_IF(PacketTraits::HasLog, log2, internal::plog2);
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
}
CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin);
CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::ceil, internal::pceil);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::floor, internal::pfloor);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::rint, internal::print);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::trunc, internal::ptrunc);
CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign);
packetmath_boolean_mask_ops_real<Scalar, Packet>();
// Rounding edge cases.
if (PacketTraits::HasRound) {
typedef typename internal::make_integer<Scalar>::type IntType;
// Start with values that cannot fit inside an integer, work down to less than one.
Scalar val =
numext::mini(Scalar(2) * static_cast<Scalar>(NumTraits<IntType>::highest()), NumTraits<Scalar>::highest());
std::vector<Scalar> values;
while (val > Scalar(0.25)) {
// Cover both even and odd, positive and negative cases.
values.push_back(val);
values.push_back(val + Scalar(0.3));
values.push_back(val + Scalar(0.5));
values.push_back(val + Scalar(0.8));
values.push_back(val + Scalar(1));
values.push_back(val + Scalar(1.3));
values.push_back(val + Scalar(1.5));
values.push_back(val + Scalar(1.8));
values.push_back(-val);
values.push_back(-val - Scalar(0.3));
values.push_back(-val - Scalar(0.5));
values.push_back(-val - Scalar(0.8));
values.push_back(-val - Scalar(1));
values.push_back(-val - Scalar(1.3));
values.push_back(-val - Scalar(1.5));
values.push_back(-val - Scalar(1.8));
values.push_back(Scalar(-1.5) + val); // Bug 1785.
val = val / Scalar(2);
}
values.push_back(NumTraits<Scalar>::infinity());
values.push_back(-NumTraits<Scalar>::infinity());
values.push_back(NumTraits<Scalar>::quiet_NaN());
for (size_t k = 0; k < values.size(); ++k) {
data1[0] = values[k];
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::ceil, internal::pceil);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::floor, internal::pfloor);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::rint, internal::print);
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::trunc, internal::ptrunc);
}
}
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(-1, 1));
data2[i] = Scalar(internal::random<double>(-1, 1));
}
CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
CHECK_CWISE1_IF(PacketTraits::HasATan, std::atan, internal::patan);
CHECK_CWISE1_IF(PacketTraits::HasATanh, std::atanh, internal::patanh);
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(-87, 88));
data2[i] = Scalar(internal::random<double>(-87, 88));
}
data1[0] = -NumTraits<Scalar>::infinity();
CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp2, internal::pexp2);
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
if (PacketTraits::HasExp) {
// Check denormals:
#if !EIGEN_ARCH_ARM
for (int j = 0; j < 3; ++j) {
data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent() - j));
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
data1[0] = -data1[0];
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
}
#endif
// zero
data1[0] = Scalar(0);
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
// inf and NaN only compare output fraction, not exponent.
test::packet_helper<PacketTraits::HasExp, Packet> h;
Packet pout;
Scalar sout;
Scalar special[] = {NumTraits<Scalar>::infinity(), -NumTraits<Scalar>::infinity(), NumTraits<Scalar>::quiet_NaN()};
for (int i = 0; i < 3; ++i) {
data1[0] = special[i];
ref[0] = Scalar(REF_FREXP(data1[0], ref[PacketSize]));
h.store(data2, internal::pfrexp(h.load(data1), h.forward_reference(pout, sout)));
VERIFY(test::areApprox(ref, data2, 1) && "internal::pfrexp");
}
}
for (int i = 0; i < PacketSize; ++i) {
data1[i] = Scalar(internal::random<double>(-1, 1));
data2[i] = Scalar(internal::random<double>(-1, 1));
}
for (int i = 0; i < PacketSize; ++i) {
data1[i + PacketSize] = Scalar(internal::random<int>(-4, 4));
data2[i + PacketSize] = Scalar(internal::random<double>(-4, 4));
}
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
if (PacketTraits::HasExp) {
data1[0] = Scalar(-1);
// underflow to zero
data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent() - 55);
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
// overflow to inf
data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent() + 10);
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
// NaN stays NaN
data1[0] = NumTraits<Scalar>::quiet_NaN();
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
VERIFY((numext::isnan)(data2[0]));
// inf stays inf
data1[0] = NumTraits<Scalar>::infinity();
data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent() - 10);
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
// zero stays zero
data1[0] = Scalar(0);
data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent() + 10);
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
// Small number big exponent.
data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::min_exponent() - 1));
data1[PacketSize] = Scalar(-NumTraits<Scalar>::min_exponent() + NumTraits<Scalar>::max_exponent());
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
// Big number small exponent.
data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::max_exponent() - 1));
data1[PacketSize] = Scalar(+NumTraits<Scalar>::min_exponent() - NumTraits<Scalar>::max_exponent());
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
}
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
}
data1[0] = Scalar(1e-20);
CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh);
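// pexp corner cases: NaN must propagate, tiny arguments must stay accurate,
// and exp(0) must be exactly 1.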
if (PacketTraits::HasExp && PacketSize >= 2) {
const Scalar small = NumTraits<Scalar>::epsilon();
data1[0] = NumTraits<Scalar>::quiet_NaN();
data1[1] = small;
test::packet_helper<PacketTraits::HasExp, Packet> h;
h.store(data2, internal::pexp(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
VERIFY_IS_APPROX(std::exp(small), data2[1]);
data1[0] = -small;
data1[1] = Scalar(0);
h.store(data2, internal::pexp(h.load(data1)));
VERIFY_IS_APPROX(std::exp(-small), data2[0]);
VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]);
data1[0] = (std::numeric_limits<Scalar>::min)();
data1[1] = -(std::numeric_limits<Scalar>::min)();
h.store(data2, internal::pexp(h.load(data1)));
VERIFY_IS_APPROX(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
VERIFY_IS_APPROX(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
data1[0] = std::numeric_limits<Scalar>::denorm_min();
data1[1] = -std::numeric_limits<Scalar>::denorm_min();
h.store(data2, internal::pexp(h.load(data1)));
VERIFY_IS_APPROX(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
VERIFY_IS_APPROX(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
}
if (PacketTraits::HasTanh) {
// NOTE: this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
data1[0] = NumTraits<Scalar>::quiet_NaN();
test::packet_helper<internal::packet_traits<Scalar>::HasTanh, Packet> h;
h.store(data2, internal::ptanh(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
}
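// The vectorized logistic function must agree with its scalar counterpart lane by lane.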
if (PacketTraits::HasExp) {
internal::scalar_logistic_op<Scalar> logistic;
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<double>(-20, 20));
}
test::packet_helper<PacketTraits::HasExp, Packet> h;
h.store(data2, logistic.packetOp(h.load(data1)));
for (int i = 0; i < PacketSize; ++i) {
VERIFY_IS_APPROX(data2[i], logistic(data1[i]));
}
}
#if EIGEN_HAS_C99_MATH
data1[0] = NumTraits<Scalar>::infinity();
data1[1] = Scalar(-1);
CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
data1[0] = NumTraits<Scalar>::infinity();
data1[1] = -NumTraits<Scalar>::infinity();
CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1);
#endif
if (PacketSize >= 2) {
data1[0] = NumTraits<Scalar>::quiet_NaN();
data1[1] = NumTraits<Scalar>::epsilon();
if (PacketTraits::HasLog) {
test::packet_helper<PacketTraits::HasLog, Packet> h;
h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
// TODO(cantonios): Re-enable for bfloat16.
if (!internal::is_same<Scalar, bfloat16>::value) {
VERIFY_IS_APPROX(std::log(data1[1]), data2[1]);
}
data1[0] = -NumTraits<Scalar>::epsilon();
data1[1] = Scalar(0);
h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
data1[0] = (std::numeric_limits<Scalar>::min)();
data1[1] = -(std::numeric_limits<Scalar>::min)();
h.store(data2, internal::plog(h.load(data1)));
// TODO(cantonios): Re-enable for bfloat16.
if (!internal::is_same<Scalar, bfloat16>::value) {
VERIFY_IS_APPROX(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
}
VERIFY((numext::isnan)(data2[1]));
// Note: 32-bit arm always flushes denorms to zero.
#if !EIGEN_ARCH_ARM
if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
data1[0] = std::numeric_limits<Scalar>::denorm_min();
data1[1] = -std::numeric_limits<Scalar>::denorm_min();
h.store(data2, internal::plog(h.load(data1)));
// TODO(rmlarsen): Re-enable for bfloat16.
if (!internal::is_same<Scalar, bfloat16>::value) {
VERIFY_IS_APPROX(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
}
VERIFY((numext::isnan)(data2[1]));
}
#endif
data1[0] = Scalar(-1.0f);
h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
data1[0] = NumTraits<Scalar>::infinity();
h.store(data2, internal::plog(h.load(data1)));
VERIFY((numext::isinf)(data2[0]));
}
if (PacketTraits::HasLog1p) {
test::packet_helper<PacketTraits::HasLog1p, Packet> h;
data1[0] = Scalar(-2);
data1[1] = -NumTraits<Scalar>::infinity();
h.store(data2, internal::plog1p(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
VERIFY((numext::isnan)(data2[1]));
}
// TODO(rmlarsen): Re-enable for half and bfloat16.
if (PacketTraits::HasCos && !internal::is_same<Scalar, half>::value &&
!internal::is_same<Scalar, bfloat16>::value) {
test::packet_helper<PacketTraits::HasCos, Packet> h;
for (Scalar k = Scalar(1); k < Scalar(10000) / NumTraits<Scalar>::epsilon(); k *= Scalar(2)) {
for (int k1 = 0; k1 <= 1; ++k1) {
data1[0] = Scalar((2 * double(k) + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
data1[1] = Scalar((2 * double(k) + 2 + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
h.store(data2, internal::pcos(h.load(data1)));
h.store(data2 + PacketSize, internal::psin(h.load(data1)));
VERIFY(data2[0] <= Scalar(1.) && data2[0] >= Scalar(-1.));
VERIFY(data2[1] <= Scalar(1.) && data2[1] >= Scalar(-1.));
VERIFY(data2[PacketSize + 0] <= Scalar(1.) && data2[PacketSize + 0] >= Scalar(-1.));
VERIFY(data2[PacketSize + 1] <= Scalar(1.) && data2[PacketSize + 1] >= Scalar(-1.));
VERIFY_IS_APPROX(data2[0], std::cos(data1[0]));
VERIFY_IS_APPROX(data2[1], std::cos(data1[1]));
VERIFY_IS_APPROX(data2[PacketSize + 0], std::sin(data1[0]));
VERIFY_IS_APPROX(data2[PacketSize + 1], std::sin(data1[1]));
VERIFY_IS_APPROX(numext::abs2(data2[0]) + numext::abs2(data2[PacketSize + 0]), Scalar(1));
VERIFY_IS_APPROX(numext::abs2(data2[1]) + numext::abs2(data2[PacketSize + 1]), Scalar(1));
}
}
data1[0] = NumTraits<Scalar>::infinity();
data1[1] = -NumTraits<Scalar>::infinity();
h.store(data2, internal::psin(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
VERIFY((numext::isnan)(data2[1]));
h.store(data2, internal::pcos(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
VERIFY((numext::isnan)(data2[1]));
data1[0] = NumTraits<Scalar>::quiet_NaN();
h.store(data2, internal::psin(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
h.store(data2, internal::pcos(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
data1[0] = -Scalar(0.);
h.store(data2, internal::psin(h.load(data1)));
VERIFY(test::biteq(data2[0], data1[0]));
h.store(data2, internal::pcos(h.load(data1)));
VERIFY_IS_EQUAL(data2[0], Scalar(1));
}
}
if (PacketTraits::HasReciprocal && PacketSize >= 2) {
test::packet_helper<PacketTraits::HasReciprocal, Packet> h;
const Scalar inf = NumTraits<Scalar>::infinity();
const Scalar zero = Scalar(0);
data1[0] = zero;
data1[1] = -zero;
h.store(data2, internal::preciprocal(h.load(data1)));
VERIFY_IS_EQUAL(data2[0], inf);
VERIFY_IS_EQUAL(data2[1], -inf);
data1[0] = inf;
data1[1] = -inf;
h.store(data2, internal::preciprocal(h.load(data1)));
VERIFY_IS_EQUAL(data2[0], zero);
VERIFY_IS_EQUAL(data2[1], -zero);
}
}
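// Check a unary packet op POP against REFOP evaluated after casting the input to REFTYPE.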
#define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) \
if (COND) { \
test::packet_helper<COND, Packet> h; \
for (int i = 0; i < PacketSize; ++i) ref[i] = SCALAR(REFOP(static_cast<REFTYPE>(data1[i]))); \
h.store(data2, POP(h.load(data1))); \
VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
}
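// Scalar reference implementations of the PropagateNaN / PropagateNumbers min/max semantics.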
template <typename Scalar>
Scalar propagate_nan_max(const Scalar& a, const Scalar& b) {
if ((numext::isnan)(a)) return a;
if ((numext::isnan)(b)) return b;
return (numext::maxi)(a, b);
}
template <typename Scalar>
Scalar propagate_nan_min(const Scalar& a, const Scalar& b) {
if ((numext::isnan)(a)) return a;
if ((numext::isnan)(b)) return b;
return (numext::mini)(a, b);
}
template <typename Scalar>
Scalar propagate_number_max(const Scalar& a, const Scalar& b) {
if ((numext::isnan)(a)) return b;
if ((numext::isnan)(b)) return a;
return (numext::maxi)(a, b);
}
template <typename Scalar>
Scalar propagate_number_min(const Scalar& a, const Scalar& b) {
if ((numext::isnan)(a)) return b;
if ((numext::isnan)(b)) return a;
return (numext::mini)(a, b);
}
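// Run a unary packet-op tester over IEEE special values (zero, denormals, norm_min, one,
// norm_max, inf, NaN), in both signs, comparing against the scalar reference.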
template <bool Cond, typename Scalar, typename Packet, bool SkipDenorms = false, typename FunctorT>
std::enable_if_t<!Cond, void> run_ieee_cases(const FunctorT&) {}
template <bool Cond, typename Scalar, typename Packet, bool SkipDenorms = false, typename FunctorT>
std::enable_if_t<Cond, void> run_ieee_cases(const FunctorT& fun) {
const int PacketSize = internal::unpacket_traits<Packet>::size;
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
const Scalar norm_max = (std::numeric_limits<Scalar>::max)();
const Scalar inf = (std::numeric_limits<Scalar>::infinity)();
const Scalar nan = (std::numeric_limits<Scalar>::quiet_NaN)();
std::vector<Scalar> values{norm_min, Scalar(0), Scalar(1), norm_max, inf, nan};
constexpr int size = PacketSize * 2;
EIGEN_ALIGN_MAX Scalar data1[size];
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Scalar ref[size];
for (int i = 0; i < size; ++i) {
data1[i] = data2[i] = ref[i] = Scalar(0);
}
if (Cond && !SkipDenorms && std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
values.push_back(std::numeric_limits<Scalar>::denorm_min());
values.push_back(norm_min / Scalar(2));
}
for (Scalar abs_value : values) {
data1[0] = abs_value;
data1[1] = -data1[0];
g_test_stack.push_back("IEEE cases: " + fun.name);
CHECK_CWISE1_IF(Cond, fun.expected, fun.actual);
g_test_stack.pop_back();
}
}
// Create a tester struct with the actual and the reference function
// as templated member functions.
#define CREATE_TESTER(NAME, ACTUAL, EXPECTED) \
struct NAME { \
template <typename T> \
T actual(const T& val) const { \
return ACTUAL(val); \
} \
template <typename T> \
T expected(const T& val) const { \
return EXPECTED(val); \
} \
const std::string name = #NAME; \
}
CREATE_TESTER(sqrt_fun, internal::psqrt, numext::sqrt);
CREATE_TESTER(rsqrt_fun, internal::prsqrt, numext::rsqrt);
CREATE_TESTER(cbrt_fun, internal::pcbrt, numext::cbrt);
CREATE_TESTER(exp_fun, internal::pexp, numext::exp);
CREATE_TESTER(exp2_fun, internal::pexp2, numext::exp2);
CREATE_TESTER(log_fun, internal::plog, numext::log);
CREATE_TESTER(log2_fun, internal::plog2, numext::log2);
CREATE_TESTER(expm1_fun, internal::pexpm1, numext::expm1);
CREATE_TESTER(log1p_fun, internal::plog1p, numext::log1p);
CREATE_TESTER(sin_fun, internal::psin, numext::sin);
CREATE_TESTER(cos_fun, internal::pcos, numext::cos);
CREATE_TESTER(tan_fun, internal::ptan, numext::tan);
CREATE_TESTER(asin_fun, internal::pasin, numext::asin);
CREATE_TESTER(acos_fun, internal::pacos, numext::acos);
CREATE_TESTER(atan_fun, internal::patan, numext::atan);
CREATE_TESTER(tanh_fun, internal::ptanh, numext::tanh);
CREATE_TESTER(atanh_fun, internal::patanh, numext::atanh);
template <typename Scalar, typename Packet>
std::enable_if_t<NumTraits<Scalar>::IsComplex, void> packetmath_ieee_special_values() {}
template <typename Scalar, typename Packet>
std::enable_if_t<!NumTraits<Scalar>::IsComplex, void> packetmath_ieee_special_values() {
typedef internal::packet_traits<Scalar> PacketTraits;
run_ieee_cases<PacketTraits::HasSqrt, Scalar, Packet>(sqrt_fun());
// TODO(rmlarsen): See if we can fix rsqrt for denorms without wreaking performance.
run_ieee_cases<PacketTraits::HasRsqrt, Scalar, Packet, true>(rsqrt_fun());
run_ieee_cases<PacketTraits::HasCbrt, Scalar, Packet>(cbrt_fun());
run_ieee_cases<PacketTraits::HasExp, Scalar, Packet>(exp_fun());
run_ieee_cases<PacketTraits::HasExp, Scalar, Packet>(exp2_fun());
run_ieee_cases<PacketTraits::HasLog, Scalar, Packet>(log_fun());
run_ieee_cases<PacketTraits::HasLog, Scalar, Packet>(log2_fun());
run_ieee_cases<PacketTraits::HasExpm1, Scalar, Packet>(expm1_fun());
run_ieee_cases<PacketTraits::HasLog1p, Scalar, Packet>(log1p_fun());
run_ieee_cases<PacketTraits::HasSin, Scalar, Packet>(sin_fun());
run_ieee_cases<PacketTraits::HasCos, Scalar, Packet>(cos_fun());
run_ieee_cases<PacketTraits::HasTan, Scalar, Packet>(tan_fun());
run_ieee_cases<PacketTraits::HasASin, Scalar, Packet>(asin_fun());
run_ieee_cases<PacketTraits::HasACos, Scalar, Packet>(acos_fun());
run_ieee_cases<PacketTraits::HasATan, Scalar, Packet>(atan_fun());
run_ieee_cases<PacketTraits::HasTanh, Scalar, Packet>(tanh_fun());
run_ieee_cases<PacketTraits::HasATanh, Scalar, Packet>(atanh_fun());
}
template <typename Scalar, typename Packet>
void packetmath_notcomplex() {
packetmath_ieee_special_values<Scalar, Packet>();
typedef internal::packet_traits<Scalar> PacketTraits;
const int PacketSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
Array<Scalar, Dynamic, 1>::Map(data1, PacketSize * 4).setRandom();
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax);
CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, internal::pmin<PropagateNumbers>);
CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
CHECK_CWISE1(numext::abs, internal::pabs);
// Vectorized versions may give a different result in the case of signed int overflow,
// which is undefined behavior (e.g. NEON).
// Also note that unsigned integers with size < sizeof(int) may be implicitly converted to a signed
// int, which can also trigger UB.
if (Eigen::NumTraits<Scalar>::IsInteger) {
for (int i = 0; i < 2 * PacketSize; ++i) {
data1[i] = data1[i] / Scalar(2);
}
}
CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff);
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload<Packet>(data1))) && "internal::predux_max");
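// plset: packet whose lanes are data1[0], data1[0] + 1, ..., data1[0] + PacketSize - 1.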
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0] + Scalar(i);
internal::pstore(data2, internal::plset<Packet>(data1[0]));
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::plset");
{
unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
// predux_all - not needed yet
// for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
// VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
// for(int k=0; k<PacketSize; ++k)
// {
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
// VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
// }
// predux_any
for (unsigned int i = 0; i < PacketSize * sizeof(Scalar); ++i) data1_bits[i] = 0x0;
VERIFY((!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
for (int k = 0; k < PacketSize; ++k) {
for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0xff;
VERIFY(internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0x00;
}
}
// Test NaN propagation.
if (!NumTraits<Scalar>::IsInteger) {
// Test reductions with no NaNs.
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNumbers>(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))) &&
"internal::predux_min<PropagateNumbers>");
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNaN>(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))) &&
"internal::predux_min<PropagateNaN>");
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNumbers>(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))) &&
"internal::predux_max<PropagateNumbers>");
ref[0] = data1[0];
for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNaN>(ref[0], data1[i]);
VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))) &&
"internal::predux_max<PropagateNumbers>");
// A single NaN.
const size_t index = static_cast<size_t>(internal::random<int>(0, PacketSize - 1));
data1[index] = NumTraits<Scalar>::quiet_NaN();
VERIFY(PacketSize == 1 || !(numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
VERIFY(PacketSize == 1 || !(numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
// All NaNs.
for (int i = 0; i < 4 * PacketSize; ++i) data1[i] = NumTraits<Scalar>::quiet_NaN();
VERIFY((numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
VERIFY((numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
// Test NaN propagation for coefficient-wise min and max.
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
data1[i + PacketSize] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
}
// Note: NaN propagation is implementation defined for pmin/pmax, so we do not test it here.
CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, (internal::pmin<PropagateNumbers>));
CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin<PropagateNaN>));
CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax<PropagateNaN>);
}
packetmath_boolean_mask_ops_notcomplex_test<Scalar, Packet>::run();
}
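// Exercise conj_helper's pmul and pmadd for one (ConjLhs, ConjRhs) combination,
// on both the scalar and the packet paths.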
template <typename Scalar, typename Packet, bool ConjLhs, bool ConjRhs>
void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) {
const int PacketSize = internal::unpacket_traits<Packet>::size;
internal::conj_if<ConjLhs> cj0;
internal::conj_if<ConjRhs> cj1;
internal::conj_helper<Scalar, Scalar, ConjLhs, ConjRhs> cj;
internal::conj_helper<Packet, Packet, ConjLhs, ConjRhs> pcj;
for (int i = 0; i < PacketSize; ++i) {
ref[i] = cj0(data1[i]) * cj1(data2[i]);
VERIFY(internal::isApprox(ref[i], cj.pmul(data1[i], data2[i])) && "conj_helper pmul");
}
internal::pstore(pval, pcj.pmul(internal::pload<Packet>(data1), internal::pload<Packet>(data2)));
VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmul");
for (int i = 0; i < PacketSize; ++i) {
Scalar tmp = ref[i];
ref[i] += cj0(data1[i]) * cj1(data2[i]);
VERIFY(internal::isApprox(ref[i], cj.pmadd(data1[i], data2[i], tmp)) && "conj_helper pmadd");
}
internal::pstore(
pval, pcj.pmadd(internal::pload<Packet>(data1), internal::pload<Packet>(data2), internal::pload<Packet>(pval)));
VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmadd");
}
template <typename Scalar, typename Packet, bool HasExp = internal::packet_traits<Scalar>::HasExp>
struct exp_complex_test_impl {
typedef typename Scalar::value_type RealScalar;
static Scalar pexp1(const Scalar& x) {
Packet px = internal::pset1<Packet>(x);
Packet py = internal::pexp(px);
return internal::pfirst(py);
}
static Scalar cis(const RealScalar& x) { return Scalar(numext::cos(x), numext::sin(x)); }
// Verify equality with signed zero.
static bool is_exactly_equal(RealScalar a, RealScalar b) {
// NaN signs are ignored here, and NaN never compares equal via ==, so handle NaNs first.
if ((numext::isnan)(a)) {
return (numext::isnan)(b);
}
RealScalar zero(0);
#if EIGEN_ARCH_ARM
// ARM automatically flushes denormals to zero.
// Preserve sign by multiplying by +0.
if (numext::abs(a) < (std::numeric_limits<RealScalar>::min)()) {
a = a * zero;
}
if (numext::abs(b) < (std::numeric_limits<RealScalar>::min)()) {
b = b * zero;
}
#endif
// Signed zero.
if (a == zero) {
// Signs are either 0 or NaN, so verify that their comparisons to zero are equal.
return (a == b) && ((numext::signbit(a) == zero) == (numext::signbit(b) == zero));
}
// Allow _some_ tolerance.
return verifyIsApprox(a, b);
}
// Verify equality with signed zero.
static bool is_exactly_equal(const Scalar& a, const Scalar& b) {
bool result = is_exactly_equal(numext::real_ref(a), numext::real_ref(b)) &&
is_exactly_equal(numext::imag_ref(a), numext::imag_ref(b));
if (!result) {
std::cout << a << " != " << b << std::endl;
}
return result;
}
static bool is_sign_exp_unspecified(const Scalar& z) {
const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
// If z is (-∞,±∞), the result is (±0,±0) (signs are unspecified)
if (numext::real_ref(z) == -inf && (numext::isinf)(numext::imag_ref(z))) {
return true;
}
// If z is (+∞,±∞), the result is (±∞,NaN) and FE_INVALID is raised (the sign of the real part is unspecified)
if (numext::real_ref(z) == +inf && (numext::isinf)(numext::imag_ref(z))) {
return true;
}
// If z is (-∞,NaN), the result is (±0,±0) (signs are unspecified)
if (numext::real_ref(z) == -inf && (numext::isnan)(numext::imag_ref(z))) {
return true;
}
// If z is (+∞,NaN), the result is (±∞,NaN) (the sign of the real part is unspecified)
if (numext::real_ref(z) == +inf && (numext::isnan)(numext::imag_ref(z))) {
return true;
}
return false;
}
static void run(Scalar* data1, Scalar* data2, Scalar* ref, int size) {
const int PacketSize = internal::unpacket_traits<Packet>::size;
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
}
CHECK_CWISE1_N(std::exp, internal::pexp, size);
// Test all corner cases (and more).
const RealScalar edges[] = {RealScalar(0),
RealScalar(1),
RealScalar(2),
RealScalar(EIGEN_PI / 2),
RealScalar(EIGEN_PI),
RealScalar(3 * EIGEN_PI / 2),
RealScalar(2 * EIGEN_PI),
numext::log(NumTraits<RealScalar>::highest()) - 1,
NumTraits<RealScalar>::highest(),
std::numeric_limits<RealScalar>::infinity(),
std::numeric_limits<RealScalar>::quiet_NaN(),
-RealScalar(0),
-RealScalar(1),
-RealScalar(2),
-RealScalar(EIGEN_PI / 2),
-RealScalar(EIGEN_PI),
-RealScalar(3 * EIGEN_PI / 2),
-RealScalar(2 * EIGEN_PI),
-numext::log(NumTraits<RealScalar>::highest()) + 1,
-NumTraits<RealScalar>::highest(),
-std::numeric_limits<RealScalar>::infinity(),
-std::numeric_limits<RealScalar>::quiet_NaN()};
for (RealScalar x : edges) {
for (RealScalar y : edges) {
Scalar z = Scalar(x, y);
Scalar w = pexp1(z);
if (is_sign_exp_unspecified(z)) {
Scalar abs_w = Scalar(numext::abs(numext::real_ref(w)), numext::abs(numext::imag_ref(w)));
Scalar expected = numext::exp(z);
Scalar abs_expected =
Scalar(numext::abs(numext::real_ref(expected)), numext::abs(numext::imag_ref(expected)));
VERIFY(is_exactly_equal(abs_w, abs_expected));
} else {
VERIFY(is_exactly_equal(w, numext::exp(z)));
}
}
}
}
};
template <typename Scalar, typename Packet>
struct exp_complex_test_impl<Scalar, Packet, false> {
typedef typename Scalar::value_type RealScalar;
static void run(Scalar*, Scalar*, Scalar*, int) {}
};
template <typename Scalar, typename Packet>
void exp_complex_test(Scalar* data1, Scalar* data2, Scalar* ref, int size) {
exp_complex_test_impl<Scalar, Packet>::run(data1, data2, ref, size);
}
template <typename Scalar, typename Packet>
void packetmath_complex() {
typedef internal::packet_traits<Scalar> PacketTraits;
typedef typename Scalar::value_type RealScalar;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const int size = PacketSize * 4;
EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
EIGEN_ALIGN_MAX Scalar pval[PacketSize * 4];
EIGEN_ALIGN_MAX RealScalar realdata[PacketSize * 4];
EIGEN_ALIGN_MAX RealScalar realref[PacketSize * 4];
for (int i = 0; i < size; ++i) {
data1[i] = internal::random<Scalar>() * Scalar(1e2);
data2[i] = internal::random<Scalar>() * Scalar(1e2);
}
test_conj_helper<Scalar, Packet, false, false>(data1, data2, ref, pval);
test_conj_helper<Scalar, Packet, false, true>(data1, data2, ref, pval);
test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval);
test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval);
// Test pcplxflip.
{
for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i]));
internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1)));
VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip");
}
if (PacketTraits::HasSqrt) {
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
}
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, size);
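// If the packet type advertises HasSign, also check complex sign against
// numext::sign on the same random data.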
CHECK_CWISE1_IF(PacketTraits::HasSign, numext::sign, internal::psign);
// Test misc. corner cases.
const RealScalar zero = RealScalar(0);
const RealScalar one = RealScalar(1);
const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
data1[0] = Scalar(zero, zero);
data1[1] = Scalar(-zero, zero);
data1[2] = Scalar(one, zero);
data1[3] = Scalar(zero, one);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
data1[0] = Scalar(-one, zero);
data1[1] = Scalar(zero, -one);
data1[2] = Scalar(one, one);
data1[3] = Scalar(-one, -one);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
data1[0] = Scalar(inf, zero);
data1[1] = Scalar(zero, inf);
data1[2] = Scalar(-inf, zero);
data1[3] = Scalar(zero, -inf);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
data1[0] = Scalar(inf, inf);
data1[1] = Scalar(-inf, inf);
data1[2] = Scalar(inf, -inf);
data1[3] = Scalar(-inf, -inf);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
data1[0] = Scalar(nan, zero);
data1[1] = Scalar(zero, nan);
data1[2] = Scalar(nan, one);
data1[3] = Scalar(one, nan);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
data1[0] = Scalar(nan, nan);
data1[1] = Scalar(inf, nan);
data1[2] = Scalar(nan, inf);
data1[3] = Scalar(-inf, nan);
CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
}
if (PacketTraits::HasLog) {
for (int i = 0; i < size; ++i) {
data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
}
CHECK_CWISE1_N(std::log, internal::plog, size);
// Test misc. corner cases.
const RealScalar zero = RealScalar(0);
const RealScalar one = RealScalar(1);
const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
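// Sweep every sign combination of component values drawn from {0, 1, inf}.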
for (RealScalar x : {zero, one, inf}) {
for (RealScalar y : {zero, one, inf}) {
data1[0] = Scalar(x, y);
data1[1] = Scalar(-x, y);
data1[2] = Scalar(x, -y);
data1[3] = Scalar(-x, -y);
CHECK_CWISE1_IM1ULP_N(std::log, internal::plog, 4);
}
}
// Set the reference results to NaN: some architectures don't implement these
// IEEE edge cases exactly, so we only require the result to be NaN.
ref[0] = Scalar(nan, nan);
ref[1] = Scalar(nan, nan);
ref[2] = Scalar(nan, nan);
ref[3] = Scalar(nan, nan);
for (RealScalar x : {zero, one}) {
data1[0] = Scalar(x, nan);
data1[1] = Scalar(-x, nan);
data1[2] = Scalar(nan, x);
data1[3] = Scalar(nan, -x);
for (int j = 0; j < size; j += PacketSize)
internal::pstore(data2 + j, internal::plog(internal::pload<Packet>(data1 + j)));
VERIFY(test::areApprox(ref, data2, 4));
}
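// An infinite component makes the log well-defined (real part +inf) even
// when the other component is NaN, so these are compared against the
// scalar reference rather than forced to NaN.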
data1[0] = Scalar(inf, nan);
data1[1] = Scalar(-inf, nan);
data1[2] = Scalar(nan, inf);
data1[3] = Scalar(nan, -inf);
CHECK_CWISE1_IM1ULP_N(numext::log, internal::plog, 4);
}
exp_complex_test<Scalar, Packet>(data1, data2, ref, size);
}
template <typename Scalar, typename Packet>
void packetmath_scatter_gather() {
typedef typename NumTraits<Scalar>::Real RealScalar;
const int PacketSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX Scalar data1[PacketSize];
RealScalar refvalue = RealScalar(0);
for (int i = 0; i < PacketSize; ++i) {
data1[i] = internal::random<Scalar>();
}
int stride = internal::random<int>(1, 20);
// Zero-initialized buffer, sized for the maximum stride of 20.
EIGEN_ALIGN_MAX Scalar buffer[PacketSize * 20] = {};
Packet packet = internal::pload<Packet>(data1);
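// pscatter writes packet element i to buffer[i * stride]; e.g. with
// PacketSize == 4 and stride == 3 it touches buffer[0], buffer[3],
// buffer[6] and buffer[9], leaving the rest of the buffer zero.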
internal::pscatter<Scalar, Packet>(buffer, packet, stride);
for (int i = 0; i < PacketSize * 20; ++i) {
if ((i % stride) == 0 && i < stride * PacketSize) {
VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter");
} else {
VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter");
}
}
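// pgather is the inverse operation: read PacketSize elements from the
// buffer with a fixed stride of 7, then verify element-wise after a store.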
for (int i = 0; i < PacketSize * 7; ++i) {
buffer[i] = internal::random<Scalar>();
}
packet = internal::pgather<Scalar, Packet>(buffer, 7);
internal::pstore(data1, packet);
for (int i = 0; i < PacketSize; ++i) {
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
}
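// Repeat both directions with the partial (length-masked) variants for
// every prefix length N from 0 to PacketSize.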
for (Index N = 0; N <= PacketSize; ++N) {
for (Index i = 0; i < N; ++i) {
data1[i] = internal::random<Scalar>();
}
for (Index i = 0; i < N * 20; ++i) {
buffer[i] = Scalar(0);
}
packet = internal::pload_partial<Packet>(data1, N);
internal::pscatter_partial<Scalar, Packet>(buffer, packet, stride, N);
for (Index i = 0; i < N * 20; ++i) {
if ((i % stride) == 0 && i < stride * N) {
VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter_partial");
} else {
VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter_partial");
}
}
for (Index i = 0; i < N * 7; ++i) {
buffer[i] = internal::random<Scalar>();
}
packet = internal::pgather_partial<Scalar, Packet>(buffer, 7, N);
internal::pstore_partial(data1, packet, N);
for (Index i = 0; i < N; ++i) {
VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather_partial");
}
}
}
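// runall dispatches on (IsComplex, IsInteger): real floating-point types run
// every suite, integer types skip the real-math suite, and complex types run
// the complex suite instead of it.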
namespace Eigen {
namespace test {
template <typename Scalar, typename PacketType>
struct runall<Scalar, PacketType, false, false> { // i.e. real floating-point (float, double, half, bfloat16)
static void run() {
packetmath<Scalar, PacketType>();
packetmath_scatter_gather<Scalar, PacketType>();
packetmath_notcomplex<Scalar, PacketType>();
packetmath_real<Scalar, PacketType>();
}
};
template <typename Scalar, typename PacketType>
struct runall<Scalar, PacketType, false, true> { // i.e. integer types
static void run() {
packetmath<Scalar, PacketType>();
packetmath_scatter_gather<Scalar, PacketType>();
packetmath_notcomplex<Scalar, PacketType>();
}
};
template <typename Scalar, typename PacketType>
struct runall<Scalar, PacketType, true, false> { // i.e. complex
static void run() {
packetmath<Scalar, PacketType>();
packetmath_scatter_gather<Scalar, PacketType>();
packetmath_complex<Scalar, PacketType>();
}
};
} // namespace test
} // namespace Eigen
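// One subtest per scalar type. bool has no runner specialization, so its
// generic and scatter/gather suites are called directly in subtest 14.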
EIGEN_DECLARE_TEST(packetmath) {
g_first_pass = true;
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(test::runner<float>::run());
CALL_SUBTEST_2(test::runner<double>::run());
CALL_SUBTEST_3(test::runner<int8_t>::run());
CALL_SUBTEST_4(test::runner<uint8_t>::run());
CALL_SUBTEST_5(test::runner<int16_t>::run());
CALL_SUBTEST_6(test::runner<uint16_t>::run());
CALL_SUBTEST_7(test::runner<int32_t>::run());
CALL_SUBTEST_8(test::runner<uint32_t>::run());
CALL_SUBTEST_9(test::runner<int64_t>::run());
CALL_SUBTEST_10(test::runner<uint64_t>::run());
CALL_SUBTEST_11(test::runner<std::complex<float>>::run());
CALL_SUBTEST_12(test::runner<std::complex<double>>::run());
CALL_SUBTEST_13(test::runner<half>::run());
CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
CALL_SUBTEST_14((packetmath_scatter_gather<bool, internal::packet_traits<bool>::type>()));
CALL_SUBTEST_15(test::runner<bfloat16>::run());
g_first_pass = false;
}
}