File libs/mesh/cube.h changed (mode: 100644) (index 3f02c65..2f316a5) |
3 |
3 |
#include "base.h" |
#include "base.h" |
4 |
4 |
#include <math/vector.h> |
#include <math/vector.h> |
5 |
5 |
#include <math/matrix.h> |
#include <math/matrix.h> |
6 |
|
#include <simd/simd.h> |
|
|
6 |
|
#include <simdcpp/simd.h> |
7 |
7 |
#include <limits> |
#include <limits> |
8 |
8 |
|
|
9 |
9 |
#define CONDITION_ARGSF bool texture_coord, bool normals |
#define CONDITION_ARGSF bool texture_coord, bool normals |
|
... |
... |
namespace mesh::cube |
71 |
71 |
|
|
72 |
72 |
namespace mesh::cube |
namespace mesh::cube |
73 |
73 |
{ |
{ |
74 |
|
template<typename float_t> [[nodiscard]] constexpr inline |
|
|
74 |
|
template<typename float_t> [[nodiscard]] constexpr |
75 |
75 |
math::vec3<float_t> face_coordinate_to_real(Face face, float_t x, float_t y) { |
math::vec3<float_t> face_coordinate_to_real(Face face, float_t x, float_t y) { |
76 |
76 |
using vec3 = math::vec3<float_t>; |
using vec3 = math::vec3<float_t>; |
77 |
77 |
switch (face) { |
switch (face) { |
|
... |
... |
namespace mesh::cube |
85 |
85 |
} |
} |
86 |
86 |
} |
} |
87 |
87 |
|
|
88 |
|
template<typename T> |
|
89 |
|
[[nodiscard]] constexpr inline Face real_coordinate_to_face(T x, T y, T z) { |
|
|
88 |
|
template<typename T> [[nodiscard]] constexpr |
|
89 |
|
Face real_coordinate_to_face(T x, T y, T z) { |
90 |
90 |
const Face faceX = x >= 0 ? Face::EAST : Face::WEST; |
const Face faceX = x >= 0 ? Face::EAST : Face::WEST; |
91 |
91 |
const Face faceY = y >= 0 ? Face::BOTTOM: Face::TOP; |
const Face faceY = y >= 0 ? Face::BOTTOM: Face::TOP; |
92 |
92 |
const Face faceZ = z >= 0 ? Face::NORTH : Face::SOUTH; |
const Face faceZ = z >= 0 ? Face::NORTH : Face::SOUTH; |
|
... |
... |
namespace mesh::cube |
100 |
100 |
return abs_z > abs_y ? faceZ : faceY; |
return abs_z > abs_y ? faceZ : faceY; |
101 |
101 |
} |
} |
102 |
102 |
|
|
103 |
|
template<simd::Level level> [[nodiscard]] constexpr inline |
|
104 |
|
simd::pint_simd<level> real_coordinate_to_face_simd( |
|
105 |
|
simd::pint_simd<level> x, |
|
106 |
|
simd::pint_simd<level> y, |
|
107 |
|
simd::pint_simd<level> z) |
|
108 |
|
{ |
|
109 |
|
using pi = simd::pint_simd<level>; |
|
110 |
|
|
|
|
103 |
|
template<simd::Level L> [[nodiscard]] constexpr |
|
104 |
|
simd::pint<L> real_coordinate_to_face_simd( |
|
105 |
|
simd::pint<L> x, simd::pint<L> y, simd::pint<L> z) { |
|
106 |
|
using pi = simd::pint<L>; |
111 |
107 |
const pi zero = 0; |
const pi zero = 0; |
112 |
|
|
|
113 |
|
const auto faceX = (x < zero).blendv(pi(EAST ), pi(WEST )); |
|
114 |
|
const auto faceY = (y < zero).blendv(pi(BOTTOM), pi(TOP )); |
|
115 |
|
const auto faceZ = (z < zero).blendv(pi(NORTH ), pi(SOUTH)); |
|
116 |
|
|
|
|
108 |
|
const auto faceX = (x < zero).blend(pi(WEST ), pi(EAST )); |
|
109 |
|
const auto faceY = (y < zero).blend(pi(TOP ), pi(BOTTOM)); |
|
110 |
|
const auto faceZ = (z < zero).blend(pi(SOUTH), pi(NORTH )); |
117 |
111 |
const auto abs_x = x.abs(); |
const auto abs_x = x.abs(); |
118 |
112 |
const auto abs_y = y.abs(); |
const auto abs_y = y.abs(); |
119 |
113 |
const auto abs_z = z.abs(); |
const auto abs_z = z.abs(); |
120 |
|
|
|
121 |
|
const auto faceZX = (abs_z > abs_x).blendv(faceX,faceZ); |
|
122 |
|
const auto faceZY = (abs_z > abs_y).blendv(faceY,faceZ); |
|
123 |
|
return (abs_x > abs_y).blendv(faceZY,faceZX); |
|
|
114 |
|
const auto faceZX = (abs_z > abs_x).blend(faceZ,faceX); |
|
115 |
|
const auto faceZY = (abs_z > abs_y).blend(faceZ,faceY); |
|
116 |
|
return (abs_x > abs_y).blend(faceZX,faceZY); |
124 |
117 |
} |
} |
125 |
118 |
|
|
126 |
|
template<typename T> constexpr inline |
|
|
119 |
|
template<typename T> constexpr |
127 |
120 |
math::matrix<3, 3, T> rotation(Face face) { |
math::matrix<3, 3, T> rotation(Face face) { |
128 |
121 |
if (face == 0) |
if (face == 0) |
129 |
122 |
return math::scale<3>(1); |
return math::scale<3>(1); |
File libs/mesh/polyhedron/icosahedron_quad_tesselated.h changed (mode: 100644) (index 0a2bf24..7fc143d) |
1 |
1 |
#pragma once |
#pragma once |
2 |
2 |
#include "icosahedron_quad.h" |
#include "icosahedron_quad.h" |
3 |
3 |
|
|
4 |
|
#include <simd/simd.h> |
|
|
4 |
|
#include <simdcpp/simd.h> |
5 |
5 |
#include <templates/tree/red_black_key.h> |
#include <templates/tree/red_black_key.h> |
6 |
6 |
|
|
7 |
7 |
#define TEMPLATE_ARGS template<bool texture_coord, bool normals, typename floating_t, typename index_t> |
#define TEMPLATE_ARGS template<bool texture_coord, bool normals, typename floating_t, typename index_t> |
|
... |
... |
namespace mesh::icosahedron_quad |
46 |
46 |
return normalize(quads<T>[quad_i].vertices[0] + dH + dD + dUP); |
return normalize(quads<T>[quad_i].vertices[0] + dH + dD + dUP); |
47 |
47 |
} |
} |
48 |
48 |
|
|
49 |
|
template<simd::Level l> [[nodiscard]] inline |
|
50 |
|
math::vec3<simd::pfloat_simd<l>> quad_coordinate_to_real( |
|
51 |
|
unsigned int quad_i, simd::pfloat_simd<l> u, simd::pfloat_simd<l> v) |
|
|
49 |
|
template<simd::Level l> [[nodiscard]] |
|
50 |
|
math::vec3<simd::pfloat<l>> quad_coordinate_to_real( |
|
51 |
|
unsigned int quad_i, simd::pfloat<l> u, simd::pfloat<l> v) |
52 |
52 |
{ |
{ |
53 |
53 |
using namespace math; |
using namespace math; |
54 |
54 |
using namespace simd; |
using namespace simd; |
55 |
|
using pf = simd::pfloat_simd<l>; |
|
|
55 |
|
using pf = simd::pfloat<l>; |
56 |
56 |
|
|
57 |
57 |
pf diagonal_ratio_x2 = (u + v); |
pf diagonal_ratio_x2 = (u + v); |
58 |
58 |
pf horizontal_ratio = (v - u) / pf(2.f); |
pf horizontal_ratio = (v - u) / pf(2.f); |
File libs/simd/def.h deleted (index af786ef..0000000) |
1 |
|
#pragma once |
|
2 |
|
|
|
3 |
|
#include <cstdint> |
|
4 |
|
#include <type_traits> |
|
5 |
|
#include <cstddef> |
|
6 |
|
#include <mm_malloc.h> |
|
7 |
|
#include <limits> |
|
8 |
|
|
|
9 |
|
namespace simd |
|
10 |
|
{ |
|
11 |
|
enum Level |
|
12 |
|
{ |
|
13 |
|
NO_SIMD, |
|
14 |
|
SSE2, SSE4_1, SSE4_2, |
|
15 |
|
AVX2, AVX512, MAX = AVX512 |
|
16 |
|
}; |
|
17 |
|
|
|
18 |
|
enum Alignment |
|
19 |
|
{ |
|
20 |
|
NO_SIMD_ALIGNMENT = 4, |
|
21 |
|
SSE2_ALIGNMENT = 16, SSE4_1_ALIGNMENT = 16, SSE4_2_ALIGNMENT = 16, |
|
22 |
|
#ifdef __AVX2__ |
|
23 |
|
AVX2_ALIGNMENT = 32, AVX512_ALIGNMENT = 32 |
|
24 |
|
#else |
|
25 |
|
AVX2_ALIGNMENT = NO_SIMD_ALIGNMENT, AVX512_ALIGNMENT = NO_SIMD_ALIGNMENT |
|
26 |
|
#endif |
|
27 |
|
}; |
|
28 |
|
|
|
29 |
|
|
|
30 |
|
Level level(); |
|
31 |
|
|
|
32 |
|
/// Returns the SIMD level reported by level(), computed once and cached for
/// the lifetime of the program.
///
/// On MinGW-w64 builds any result of AVX2 or above is clamped to SSE4_2
/// (see the FIXME below).
inline Level LEVEL()
{
    static const Level cached = [] {
        Level detected = level();
#ifdef __MINGW64__ //FIXME mingw does something wrong
        if (detected >= simd::Level::AVX2)
            detected = SSE4_2;
#endif
        return detected;
    }();
    return cached;
}
|
41 |
|
|
|
42 |
|
template<typename MASK, int alignment> |
|
43 |
|
struct mask; |
|
44 |
|
|
|
45 |
|
template<typename MASK, int alignment> |
|
46 |
|
struct get_mask |
|
47 |
|
{ |
|
48 |
|
using T = mask<int32_t, alignment>; |
|
49 |
|
}; |
|
50 |
|
|
|
51 |
|
template<typename PACKED, typename single, int alignment> |
|
52 |
|
struct alignas(alignment) packed |
|
53 |
|
{ |
|
54 |
|
constexpr static const unsigned int COUNT = sizeof (PACKED) / sizeof (single); |
|
55 |
|
|
|
56 |
|
using P = PACKED; |
|
57 |
|
|
|
58 |
|
inline packed(); |
|
59 |
|
inline packed(const single&); |
|
60 |
|
inline explicit packed(const single*); |
|
61 |
|
|
|
62 |
|
template<typename U = PACKED> |
|
63 |
|
packed(const PACKED &p, typename std::enable_if<not std::is_same<U, single>::value>::type* = nullptr) : _(p) {} |
|
64 |
|
|
|
65 |
|
[[nodiscard]] inline void *operator new(size_t size) { return _mm_malloc(size, alignment); } |
|
66 |
|
|
|
67 |
|
void operator delete (void *p) { _mm_free(p); } |
|
68 |
|
|
|
69 |
|
inline void extract_u(single *dst) const; |
|
70 |
|
inline void extract (single *dst) const { extract_u(dst); } |
|
71 |
|
|
|
72 |
|
template<typename P_O, typename single_other> |
|
73 |
|
[[nodiscard]] inline static packed convert(const packed<P_O, single_other, alignment>&); |
|
74 |
|
|
|
75 |
|
template<typename P_O, typename single_other> |
|
76 |
|
[[nodiscard]] inline static packed cast (const packed<P_O, single_other, alignment>&); |
|
77 |
|
|
|
78 |
|
[[nodiscard]] inline packed operator ~ () const; |
|
79 |
|
[[nodiscard]] inline packed operator & (const packed&) const; |
|
80 |
|
[[nodiscard]] inline packed operator ^ (const packed&) const; |
|
81 |
|
[[nodiscard]] inline packed operator | (const packed&) const; |
|
82 |
|
|
|
83 |
|
[[nodiscard]] inline packed and_not (const packed&) const; |
|
84 |
|
|
|
85 |
|
[[nodiscard]] inline packed operator + (const packed&) const; |
|
86 |
|
[[nodiscard]] inline packed operator - (const packed&) const; |
|
87 |
|
[[nodiscard]] inline packed operator * (const packed&) const; |
|
88 |
|
[[nodiscard]] inline packed operator / (const packed&) const; |
|
89 |
|
|
|
90 |
|
inline packed& operator += (const packed &p) { *this = *this + p; return *this; } |
|
91 |
|
inline packed& operator -= (const packed &p) { *this = *this - p; return *this; } |
|
92 |
|
inline packed& operator *= (const packed &p) { *this = *this * p; return *this; } |
|
93 |
|
inline packed& operator /= (const packed &p) { *this = *this / p; return *this; } |
|
94 |
|
|
|
95 |
|
template<typename int_t> |
|
96 |
|
[[nodiscard]] inline packed operator << (int_t) const; |
|
97 |
|
template<typename int_t> |
|
98 |
|
[[nodiscard]] inline packed operator >> (int_t) const; |
|
99 |
|
|
|
100 |
|
//auto = mask |
|
101 |
|
[[nodiscard]] inline typename get_mask<PACKED, alignment>::T operator == (const packed&) const; |
|
102 |
|
[[nodiscard]] inline typename get_mask<PACKED, alignment>::T operator <= (const packed&) const; |
|
103 |
|
[[nodiscard]] inline typename get_mask<PACKED, alignment>::T operator >= (const packed&) const; |
|
104 |
|
[[nodiscard]] inline typename get_mask<PACKED, alignment>::T operator < (const packed&) const; |
|
105 |
|
[[nodiscard]] inline typename get_mask<PACKED, alignment>::T operator > (const packed&) const; |
|
106 |
|
|
|
107 |
|
[[nodiscard]] static inline packed mul_add (const packed&, const packed&, const packed&); |
|
108 |
|
[[nodiscard]] static inline packed mul_sub (const packed&, const packed&, const packed&); |
|
109 |
|
[[nodiscard]] static inline packed nmul_add (const packed&, const packed&, const packed&); |
|
110 |
|
|
|
111 |
|
[[nodiscard]] inline packed mul_add (const packed &a, const packed &b) const { return mul_add(a,b,*this); } |
|
112 |
|
[[nodiscard]] inline packed mul_sub (const packed &a, const packed &b) const { return mul_sub(a,b,*this); } |
|
113 |
|
[[nodiscard]] inline packed nmul_add (const packed &a, const packed &b) const { return nmul_add(a,b,*this); } |
|
114 |
|
|
|
115 |
|
[[nodiscard]] inline packed floor() const; |
|
116 |
|
[[nodiscard]] inline packed rsqrt() const; |
|
117 |
|
|
|
118 |
|
[[nodiscard]] inline packed abs() const; |
|
119 |
|
|
|
120 |
|
/// Returns the largest of the COUNT lanes held in this packed value.
///
/// The lanes are spilled to a scratch buffer with extract() and reduced with a
/// scalar loop. The buffer uses the element type `single`, so the function is
/// usable for integer as well as float packs.
[[nodiscard]] inline single row_max() const
{
    //TODO

    // extract() is specialised with aligned stores for the SIMD packs, so the
    // scratch buffer must carry the same alignment as the pack itself.
    alignas(alignment) single lanes[COUNT];
    extract(lanes);

    single best = lanes[0];
    for (unsigned int i = 1; i < COUNT; ++i)
        if (best < lanes[i])
            best = lanes[i];

    return best;
}
|
139 |
|
|
|
140 |
|
/// Returns the smallest of the COUNT lanes held in this packed value.
///
/// The lanes are spilled to a scratch buffer with extract() and reduced with a
/// scalar loop. The buffer uses the element type `single`, so the function is
/// usable for integer as well as float packs.
[[nodiscard]] inline single row_min() const
{
    //TODO

    // extract() is specialised with aligned stores for the SIMD packs, so the
    // scratch buffer must carry the same alignment as the pack itself.
    alignas(alignment) single lanes[COUNT];
    extract(lanes);

    single best = lanes[0];
    for (unsigned int i = 1; i < COUNT; ++i)
        if (best > lanes[i])
            best = lanes[i];

    return best;
}
|
159 |
|
|
|
160 |
|
|
|
161 |
|
template<typename PACKED_INT> |
|
162 |
|
[[nodiscard]] static inline packed permute(const packed &a, const PACKED_INT &i); |
|
163 |
|
|
|
164 |
|
/// Lane-wise hash of three coordinates and a seed.
///
/// The inputs are XOR-folded together, the fold is cubed and scaled by 60493,
/// and the result is XOR-ed with itself shifted right by 13 bits.
[[nodiscard]] static inline packed hash(const packed &seed, const packed &x, const packed &y, const packed &z)
{
    const packed folded = z ^ (y ^ (x ^ seed));
    const packed scrambled = (folded * folded * packed(60493)) * folded;
    return (scrambled >> 13) ^ scrambled;
}
|
170 |
|
|
|
171 |
|
PACKED _; |
|
172 |
|
} __attribute__((aligned(alignment))); |
|
173 |
|
|
|
174 |
|
template<typename PACKED_INT, int alignment> using pint = simd::packed<PACKED_INT, int32_t, alignment>; |
|
175 |
|
template<typename PACKED_FLOAT, int alignment> using pfloat = simd::packed<PACKED_FLOAT, float, alignment>; |
|
176 |
|
|
|
177 |
|
template<typename MASK, int alignment> |
|
178 |
|
struct alignas(alignment) mask |
|
179 |
|
{ |
|
180 |
|
using pint = packed<MASK, int32_t, alignment>; |
|
181 |
|
|
|
182 |
|
inline mask(const MASK& m) : _(m) {} |
|
183 |
|
inline mask(const pint &i) : _(i._) {} |
|
184 |
|
|
|
185 |
|
[[nodiscard]] inline mask operator & (const mask &m) const { return pint(_) & pint(m._); } |
|
186 |
|
[[nodiscard]] inline mask operator | (const mask &m) const { return pint(_) | pint(m._); } |
|
187 |
|
[[nodiscard]] inline mask and_not (const mask &m) const { return pint(_).and_not(pint(m._)); } |
|
188 |
|
|
|
189 |
|
inline mask operator ~ () const { return ~pint(_); } |
|
190 |
|
|
|
191 |
|
|
|
192 |
|
template<typename P_T, typename S_T> |
|
193 |
|
[[nodiscard]] inline packed<P_T, S_T, alignment> operator & (const packed<P_T,S_T, alignment> &p) const; |
|
194 |
|
|
|
195 |
|
template<typename PACKED> |
|
196 |
|
using pfloat = packed<PACKED, float, alignment>; |
|
197 |
|
|
|
198 |
|
template<typename P> |
|
199 |
|
[[nodiscard]] inline pfloat<P> add (const pfloat<P> &a, const pfloat<P> &b) const |
|
200 |
|
{ |
|
201 |
|
return a + (pfloat<P>::cast(pint(_)) & b); |
|
202 |
|
} |
|
203 |
|
|
|
204 |
|
template<typename P> |
|
205 |
|
[[nodiscard]] inline pfloat<P> sub (const pfloat<P> &a, const pfloat<P> &b) const |
|
206 |
|
{ |
|
207 |
|
return a - (pfloat<P>::cast(pint(_)) & b); |
|
208 |
|
} |
|
209 |
|
|
|
210 |
|
[[nodiscard]] inline pint add (const pint &a, const pint &b) const { return a + (pint{_} & b); } |
|
211 |
|
[[nodiscard]] inline pint sub (const pint &a, const pint &b) const { return a - (pint{_} & b); } |
|
212 |
|
|
|
213 |
|
template<typename PF> |
|
214 |
|
[[nodiscard]] inline pfloat<PF> blendv (const pfloat<PF> &if_false, const pfloat<PF> &if_true); |
|
215 |
|
[[nodiscard]] inline pint blendv (const pint &if_false, const pint &if_true); |
|
216 |
|
|
|
217 |
|
MASK _; |
|
218 |
|
} __attribute__((aligned(alignment))); |
|
219 |
|
} |
|
File libs/simd/simd.cpp deleted (index 72b03ed..0000000) |
1 |
|
#include "def.h" |
|
2 |
|
|
|
3 |
|
namespace simd |
|
4 |
|
{ |
|
5 |
|
#ifdef _WIN32 |
|
6 |
|
|
|
7 |
|
#include <intrin.h> |
|
8 |
|
|
|
9 |
|
inline void cpuid(int32_t out[4], int32_t x) |
|
10 |
|
{ |
|
11 |
|
__cpuidex(out, x, 0); |
|
12 |
|
} |
|
13 |
|
inline uint64_t xgetbv(unsigned int x) |
|
14 |
|
{ |
|
15 |
|
return _xgetbv(x); |
|
16 |
|
} |
|
17 |
|
|
|
18 |
|
#else |
|
19 |
|
|
|
20 |
|
#include <cpuid.h> |
|
21 |
|
|
|
22 |
|
inline void cpuid(int32_t out[4], int32_t x) |
|
23 |
|
{ |
|
24 |
|
__cpuid_count(x, 0, out[0], out[1], out[2], out[3]); |
|
25 |
|
} |
|
26 |
|
inline uint64_t xgetbv(unsigned int index) |
|
27 |
|
{ |
|
28 |
|
uint32_t eax, edx; |
|
29 |
|
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); |
|
30 |
|
return (uint64_t(edx) << 32) | eax; |
|
31 |
|
} |
|
32 |
|
#endif |
|
33 |
|
|
|
34 |
|
|
|
35 |
|
/// Probes the running CPU (CPUID) and the OS-enabled register state (XGETBV)
/// and returns the highest SIMD level both of them support.
///
/// Possible results: NO_SIMD, SSE2, SSE4_1, AVX2, AVX512.
/// NOTE(review): SSE4_2 is never returned by this function - confirm that
/// callers do not rely on it being reported.
Level level()
{
    //https://github.com/Mysticial/FeatureDetector

    int cpuInfo[4];
    cpuid(cpuInfo, 0);

    // Taken by value: cpuInfo is overwritten by the cpuid calls below, while
    // the highest supported leaf is needed again for the leaf-7 guard.
    const int nIds = cpuInfo[0];

    if (nIds < 0x00000001)
        return NO_SIMD;

    cpuid(cpuInfo, 0x00000001);

    if ((cpuInfo[3] & 1 << 26) == 0) // SSE2
        return NO_SIMD;

    if ((cpuInfo[2] & 1 << 19) == 0) // SSE41
        return SSE2;

    // AVX
    const bool cpuXSaveSupport = (cpuInfo[2] & 1 << 26) != 0;
    const bool osAVXSupport    = (cpuInfo[2] & 1 << 27) != 0;
    const bool cpuAVXSupport   = (cpuInfo[2] & 1 << 28) != 0;

    if (!(cpuXSaveSupport && osAVXSupport && cpuAVXSupport))
        return SSE4_1;

    // xgetbv is only executed once the checks above have passed.
    const uint64_t xcrFeatureMask = xgetbv(0 /* = XCR_XFEATURE_ENABLED*/);
    if ((xcrFeatureMask & 0x6) != 0x6)
        return SSE4_1;

    if (nIds < 0x00000007) // AVX2 FMA3
        return SSE4_1;

    cpuid(cpuInfo, 0x00000007);

    const bool cpuAVX2Support = (cpuInfo[1] & 1 << 5) != 0;

    if (!cpuAVX2Support)
        return SSE4_1;

    // AVX512
    const bool cpuAVX512Support = (cpuInfo[1] & 1 << 16) != 0;
    const bool osAVX512Support  = (xcrFeatureMask & 0xe6) == 0xe6;

    if (!cpuAVX512Support || !osAVX512Support)
        return AVX2;

    return AVX512;
}
|
85 |
|
} |
|
File libs/simd/simd_avx2.inl deleted (index 1efceee..0000000) |
1 |
|
#pragma once |
|
2 |
|
|
|
3 |
|
#ifdef __AVX2__ |
|
4 |
|
|
|
5 |
|
#include "def.h" |
|
6 |
|
|
|
7 |
|
#include <immintrin.h> |
|
8 |
|
|
|
9 |
|
#include <cstring> |
|
10 |
|
|
|
11 |
|
namespace simd |
|
12 |
|
{ |
|
13 |
|
using mask_avx2 = mask <__m256i, AVX2_ALIGNMENT>; |
|
14 |
|
using pfloat_avx2 = packed<__m256, float, AVX2_ALIGNMENT>; |
|
15 |
|
using pint_avx2 = packed<__m256i, int32_t, AVX2_ALIGNMENT>; |
|
16 |
|
|
|
17 |
|
template<> struct get_mask<__m256, AVX2_ALIGNMENT> { using T = mask_avx2; }; |
|
18 |
|
template<> struct get_mask<__m256i, AVX2_ALIGNMENT> { using T = mask_avx2; }; |
|
19 |
|
|
|
20 |
|
#define PFLOAT(return) template<> inline return pfloat_avx2:: |
|
21 |
|
#define PFLOAT2(return) template<> template<> [[nodiscard]] inline return pfloat_avx2:: |
|
22 |
|
|
|
23 |
|
#define PINT(return) template<> inline return pint_avx2:: |
|
24 |
|
#define PINT2(return) template<> template<> [[nodiscard]] inline return pint_avx2:: |
|
25 |
|
|
|
26 |
|
|
|
27 |
|
PFLOAT() packed() : _(_mm256_setzero_ps()) {} |
|
28 |
|
PFLOAT() packed(const float &f) : _(_mm256_set1_ps(f)) {} |
|
29 |
|
#if 0 |
|
30 |
|
PFLOAT() packed(const float *p) : _(_mm256_loadu_ps(p)) {} |
|
31 |
|
PFLOAT(void) extract(float *p) const { _mm256_storeu_ps(p, _); } |
|
32 |
|
#else |
|
33 |
|
PFLOAT() packed(const float *p) : _(_mm256_load_ps(p)) {} |
|
34 |
|
PFLOAT(void) extract(float *p) const { _mm256_store_ps(p, _); } |
|
35 |
|
#endif |
|
36 |
|
PFLOAT(void) extract_u(float *p) const { _mm256_storeu_ps(p, _); } |
|
37 |
|
|
|
38 |
|
|
|
39 |
|
PINT() packed(const int32_t &i) : _(_mm256_set1_epi32(i)) {} |
|
40 |
|
|
|
41 |
|
PFLOAT2(pfloat_avx2) convert(const pint_avx2 &v) { return _mm256_cvtepi32_ps (v._); } |
|
42 |
|
PFLOAT2(pfloat_avx2) cast (const pint_avx2 &v) { return _mm256_castsi256_ps(v._); } |
|
43 |
|
|
|
44 |
|
PINT2(pint_avx2) convert(const pfloat_avx2 &v) { return _mm256_cvtps_epi32 (v._); } |
|
45 |
|
PINT2(pint_avx2) cast (const pfloat_avx2 &v) { return _mm256_castps_si256(v._); } |
|
46 |
|
|
|
47 |
|
PFLOAT(mask_avx2) operator < (const pfloat_avx2 &v) const |
|
48 |
|
{ |
|
49 |
|
return pint_avx2::cast(pfloat_avx2(_mm256_cmp_ps(_, v._, _CMP_LT_OS))); |
|
50 |
|
} |
|
51 |
|
PFLOAT(mask_avx2) operator > (const pfloat_avx2 &v) const |
|
52 |
|
{ |
|
53 |
|
return pint_avx2::cast(pfloat_avx2(_mm256_cmp_ps(_, v._, _CMP_GT_OS))); |
|
54 |
|
} |
|
55 |
|
PFLOAT(mask_avx2) operator <= (const pfloat_avx2 &v) const |
|
56 |
|
{ |
|
57 |
|
return pint_avx2::cast(pfloat_avx2(_mm256_cmp_ps(_, v._, _CMP_LE_OS))); |
|
58 |
|
} |
|
59 |
|
PFLOAT(mask_avx2) operator >= (const pfloat_avx2 &v) const |
|
60 |
|
{ |
|
61 |
|
return pint_avx2::cast(pfloat_avx2(_mm256_cmp_ps(_, v._, _CMP_GE_OS))); |
|
62 |
|
} |
|
63 |
|
PFLOAT(mask_avx2) operator == (const pfloat_avx2 &v) const |
|
64 |
|
{ |
|
65 |
|
return pint_avx2::cast(pfloat_avx2(_mm256_cmp_ps(_, v._, _CMP_EQ_OS))); |
|
66 |
|
} |
|
67 |
|
|
|
68 |
|
PFLOAT(pfloat_avx2) operator + (const pfloat_avx2 &f) const { return _mm256_add_ps(_, f._); } |
|
69 |
|
PFLOAT(pfloat_avx2) operator - (const pfloat_avx2 &f) const { return _mm256_sub_ps(_, f._); } |
|
70 |
|
PFLOAT(pfloat_avx2) operator * (const pfloat_avx2 &f) const { return _mm256_mul_ps(_, f._); } |
|
71 |
|
PFLOAT(pfloat_avx2) operator / (const pfloat_avx2 &f) const { return _mm256_div_ps(_, f._); } |
|
72 |
|
|
|
73 |
|
PFLOAT(pfloat_avx2) operator & (const pfloat_avx2 &f) const { return _mm256_and_ps(_,f._); } |
|
74 |
|
PFLOAT(pfloat_avx2) operator ^ (const pfloat_avx2 &f) const { return _mm256_xor_ps(_,f._); } |
|
75 |
|
|
|
76 |
|
PFLOAT(pfloat_avx2) and_not (const pfloat_avx2 &f) const { return _mm256_andnot_ps(_,f._); } |
|
77 |
|
|
|
78 |
|
PFLOAT(pfloat_avx2) mul_add (const pfloat_avx2 &a, const pfloat_avx2 &b, const pfloat_avx2 &c) |
|
79 |
|
{ |
|
80 |
|
return _mm256_fmadd_ps (a._,b._,c._); |
|
81 |
|
} |
|
82 |
|
PFLOAT(pfloat_avx2) mul_sub (const pfloat_avx2 &a, const pfloat_avx2 &b, const pfloat_avx2 &c) |
|
83 |
|
{ |
|
84 |
|
return _mm256_fmsub_ps (a._,b._,c._); |
|
85 |
|
} |
|
86 |
|
PFLOAT(pfloat_avx2) nmul_add (const pfloat_avx2 &a, const pfloat_avx2 &b, const pfloat_avx2 &c) |
|
87 |
|
{ |
|
88 |
|
return _mm256_fnmadd_ps(a._,b._,c._); |
|
89 |
|
} |
|
90 |
|
|
|
91 |
|
PFLOAT2(pfloat_avx2) permute(const pfloat_avx2 &f, const pint_avx2 &i) |
|
92 |
|
{ |
|
93 |
|
return _mm256_permutevar8x32_ps(f._, i._); |
|
94 |
|
} |
|
95 |
|
PFLOAT(pfloat_avx2) floor() const { return _mm256_floor_ps(_); } |
|
96 |
|
|
|
97 |
|
PFLOAT(pfloat_avx2) rsqrt() const { return _mm256_rsqrt_ps(_); } |
|
98 |
|
|
|
99 |
|
PFLOAT(pfloat_avx2) abs() const { return pfloat_avx2(-0.f).and_not(*this); } |
|
100 |
|
|
|
101 |
|
#undef PFLOAT |
|
102 |
|
#undef PFLOAT2 |
|
103 |
|
|
|
104 |
|
|
|
105 |
|
PINT() packed() : _(_mm256_setzero_si256()) {} |
|
106 |
|
// Loads eight int32 lanes from p with the integer aligned-load intrinsic;
// p must be 32-byte aligned.
PINT() packed(const int32_t *p) : _(_mm256_load_si256(reinterpret_cast<const __m256i*>(p))) {}
|
107 |
|
|
|
108 |
|
PINT(void) extract(int32_t *p) const { memcpy(p, &_, sizeof (_)); } |
|
109 |
|
|
|
110 |
|
PINT(pint_avx2) operator + (const pint_avx2 &v) const { return _mm256_add_epi32(_, v._); } |
|
111 |
|
PINT(pint_avx2) operator - (const pint_avx2 &v) const { return _mm256_sub_epi32(_, v._); } |
|
112 |
|
PINT(pint_avx2) operator * (const pint_avx2 &v) const { return _mm256_mullo_epi32(_, v._); } |
|
113 |
|
|
|
114 |
|
PINT(pint_avx2) operator ^ (const pint_avx2 &i) const { return _mm256_xor_si256(_,i._); } |
|
115 |
|
|
|
116 |
|
PINT(pint_avx2) operator | (const pint_avx2 &i) const { return _mm256_or_si256 (_,i._); } |
|
117 |
|
PINT(pint_avx2) operator & (const pint_avx2 &i) const { return _mm256_and_si256(_,i._); } |
|
118 |
|
PINT(pint_avx2) and_not (const pint_avx2 &i) const { return _mm256_andnot_si256(_,i._); } |
|
119 |
|
|
|
120 |
|
static_assert (0xffffffff == uint32_t(-1)); |
|
121 |
|
PINT(pint_avx2) operator ~ () const { return pint_avx2(_) ^ pint_avx2(-1); } |
|
122 |
|
|
|
123 |
|
PINT(mask_avx2) operator > (const pint_avx2 &i) const { return _mm256_cmpgt_epi32(_, i._); } |
|
124 |
|
PINT(mask_avx2) operator < (const pint_avx2 &i) const { return _mm256_cmpgt_epi32(i._, _); } |
|
125 |
|
PINT(mask_avx2) operator == (const pint_avx2 &i) const { return _mm256_cmpeq_epi32(_, i._); } |
|
126 |
|
|
|
127 |
|
|
|
128 |
|
PINT2(pint_avx2) operator << (int count) const { return _mm256_slli_epi32(_, count); } |
|
129 |
|
PINT2(pint_avx2) operator >> (int count) const { return _mm256_srai_epi32(_, count); } |
|
130 |
|
|
|
131 |
|
PINT(pint_avx2) abs() const { return _mm256_abs_epi32(_); } |
|
132 |
|
|
|
133 |
|
#undef PINT |
|
134 |
|
#undef PINT2 |
|
135 |
|
|
|
136 |
|
#define MASK(return) template<> [[nodiscard]] inline return mask_avx2:: |
|
137 |
|
#define MASK2(return) template<> template<> [[nodiscard]] inline return mask_avx2:: |
|
138 |
|
|
|
139 |
|
MASK2(pfloat_avx2) operator &(const pfloat_avx2 &f) const { return pfloat_avx2::cast(pint(_)) & f; } |
|
140 |
|
|
|
141 |
|
MASK2(pfloat_avx2) blendv (const pfloat_avx2 &a, const pfloat_avx2 &b) |
|
142 |
|
{ |
|
143 |
|
return _mm256_blendv_ps (a._, b._, __m256(_)); |
|
144 |
|
} |
|
145 |
|
|
|
146 |
|
MASK(pint_avx2) blendv (const pint_avx2 &a, const pint_avx2 &b ) { return _mm256_blendv_epi8(a._, b._, _); } |
|
147 |
|
|
|
148 |
|
#undef MASK |
|
149 |
|
#undef MASK2 |
|
150 |
|
|
|
151 |
|
} |
|
152 |
|
|
|
153 |
|
#else |
|
154 |
|
|
|
155 |
|
#include "simd_no.inl" |
|
156 |
|
namespace simd |
|
157 |
|
{ |
|
158 |
|
using mask_avx2 = mask_nosimd; |
|
159 |
|
using pfloat_avx2 = pfloat_nosimd; |
|
160 |
|
using pint_avx2 = pint_nosimd; |
|
161 |
|
} |
|
162 |
|
#endif |
|
163 |
|
|
|
164 |
|
|
|
165 |
|
#ifdef FN_COMPILE_AVX512 |
|
166 |
|
#error TODO AVX512 |
|
167 |
|
#include <x86intrin.h> |
|
168 |
|
#endif |
|
File libs/simd/simd_no.inl deleted (index 4a63826..0000000) |
1 |
|
#pragma once |
|
2 |
|
|
|
3 |
|
#include "def.h" |
|
4 |
|
#include <cstring> |
|
5 |
|
#include <cmath> |
|
6 |
|
|
|
7 |
|
namespace simd |
|
8 |
|
{ |
|
9 |
|
using mask_nosimd = mask<int32_t, NO_SIMD_ALIGNMENT>; |
|
10 |
|
using pfloat_nosimd = packed<float, float, NO_SIMD_ALIGNMENT>; |
|
11 |
|
using pint_nosimd = packed<int32_t, int32_t, NO_SIMD_ALIGNMENT >; |
|
12 |
|
|
|
13 |
|
#define PFLOAT(return) template<> inline return pfloat_nosimd:: |
|
14 |
|
#define PFLOAT2(return) template<> template<> [[nodiscard]] inline return pfloat_nosimd:: |
|
15 |
|
|
|
16 |
|
#define PINT(return) template<> inline return pint_nosimd:: |
|
17 |
|
#define PINT2(return) template<> template<> [[nodiscard]] inline return pint_nosimd:: |
|
18 |
|
|
|
19 |
|
|
|
20 |
|
PFLOAT() packed() : _(0) {} |
|
21 |
|
PFLOAT() packed(const float &f) : _(f) {} |
|
22 |
|
PFLOAT() packed(const float *p) : _(*p) {} |
|
23 |
|
|
|
24 |
|
PFLOAT(void) extract_u(float *p) const { *p = _; } |
|
25 |
|
|
|
26 |
|
PFLOAT2(pfloat_nosimd) convert(const pint_nosimd &v) { return float(v._); } |
|
27 |
|
PFLOAT2(pfloat_nosimd) cast (const pint_nosimd &v) { float f; memcpy(&f, &v, sizeof(float)); return f; } |
|
28 |
|
|
|
29 |
|
PINT() packed(const int32_t &i) : _(i) {} |
|
30 |
|
|
|
31 |
|
PINT2(pint_nosimd) convert(const pfloat_nosimd &v) { return int32_t (v._); } |
|
32 |
|
PINT2(pint_nosimd) cast (const pfloat_nosimd &v) { int32_t i; memcpy(&i, &v, sizeof(float)); return i; } |
|
33 |
|
|
|
34 |
|
PFLOAT(mask_nosimd) operator < (const pfloat_nosimd &v) const { return _ < v._ ? -1 : 0; } |
|
35 |
|
PFLOAT(mask_nosimd) operator > (const pfloat_nosimd &v) const { return _ > v._ ? -1 : 0; } |
|
36 |
|
PFLOAT(mask_nosimd) operator <= (const pfloat_nosimd &v) const { return _ <= v._ ? -1 : 0; } |
|
37 |
|
PFLOAT(mask_nosimd) operator >= (const pfloat_nosimd &v) const { return _ >= v._ ? -1 : 0; } |
|
38 |
|
// Value comparison (so +0 equals -0 and NaN never compares equal), matching
// the ordered-equal compare used by the AVX2 specialisation.
PFLOAT(mask_nosimd) operator == (const pfloat_nosimd &v) const { return _ == v._ ? -1 : 0; }
|
39 |
|
|
|
40 |
|
PFLOAT(pfloat_nosimd) operator + (const pfloat_nosimd &f) const { return _ + f._; } |
|
41 |
|
PFLOAT(pfloat_nosimd) operator - (const pfloat_nosimd &f) const { return _ - f._; } |
|
42 |
|
PFLOAT(pfloat_nosimd) operator * (const pfloat_nosimd &f) const { return _ * f._; } |
|
43 |
|
PFLOAT(pfloat_nosimd) operator / (const pfloat_nosimd &f) const { return _ / f._; } |
|
44 |
|
|
|
45 |
|
union float_int { float f; int32_t i; }; |
|
46 |
|
|
|
47 |
|
// Bitwise AND of the two floats' 32-bit representations.
// The bits are moved with memcpy so that no inactive union member is read.
PFLOAT(pfloat_nosimd) operator & (const pfloat_nosimd &f) const
{
    int32_t lhs_bits;
    int32_t rhs_bits;
    memcpy(&lhs_bits, &_,   sizeof lhs_bits);
    memcpy(&rhs_bits, &f._, sizeof rhs_bits);

    const int32_t out_bits = lhs_bits & rhs_bits;

    float out;
    memcpy(&out, &out_bits, sizeof out);
    return out;
}
|
54 |
|
// Bitwise XOR of the two floats' 32-bit representations.
// The bits are moved with memcpy so that no inactive union member is read.
PFLOAT(pfloat_nosimd) operator ^ (const pfloat_nosimd &f) const
{
    int32_t lhs_bits;
    int32_t rhs_bits;
    memcpy(&lhs_bits, &_,   sizeof lhs_bits);
    memcpy(&rhs_bits, &f._, sizeof rhs_bits);

    const int32_t out_bits = lhs_bits ^ rhs_bits;

    float out;
    memcpy(&out, &out_bits, sizeof out);
    return out;
}
|
61 |
|
|
|
62 |
|
// Returns (~this) & f, computed on the floats' 32-bit representations.
// The bits are moved with memcpy so that no inactive union member is read.
PFLOAT(pfloat_nosimd) and_not (const pfloat_nosimd &f) const
{
    int32_t lhs_bits;
    int32_t rhs_bits;
    memcpy(&lhs_bits, &_,   sizeof lhs_bits);
    memcpy(&rhs_bits, &f._, sizeof rhs_bits);

    const int32_t out_bits = ~lhs_bits & rhs_bits;

    float out;
    memcpy(&out, &out_bits, sizeof out);
    return out;
}
|
69 |
|
|
|
70 |
|
PFLOAT(pfloat_nosimd) mul_add (const pfloat_nosimd &a, const pfloat_nosimd &b, const pfloat_nosimd &c) |
|
71 |
|
{ |
|
72 |
|
return a * b + c; |
|
73 |
|
} |
|
74 |
|
PFLOAT(pfloat_nosimd) mul_sub (const pfloat_nosimd &a, const pfloat_nosimd &b, const pfloat_nosimd &c) |
|
75 |
|
{ |
|
76 |
|
return a * b - c; |
|
77 |
|
} |
|
78 |
|
PFLOAT(pfloat_nosimd) nmul_add (const pfloat_nosimd &a, const pfloat_nosimd &b, const pfloat_nosimd &c) |
|
79 |
|
{ |
|
80 |
|
return c - a * b; |
|
81 |
|
} |
|
82 |
|
|
|
83 |
|
PFLOAT(pfloat_nosimd) floor() const { return std::floor(_); } |
|
84 |
|
|
|
85 |
|
// Approximate reciprocal square root: an integer bit-trick produces the
// initial estimate, which is then refined by one Newton-Raphson step.
// The bits are moved with memcpy so that no inactive union member is read.
PFLOAT(pfloat_nosimd) rsqrt() const
{
    int32_t bits;
    memcpy(&bits, &_, sizeof bits);

    bits = 0x5f3759df - (bits >> 1);

    float estimate;
    memcpy(&estimate, &bits, sizeof estimate);

    const float xhalf = 0.5f * _;
    return estimate*(1.5f - xhalf*estimate*estimate);
}
|
95 |
|
|
|
96 |
|
PFLOAT(pfloat_nosimd) abs() const { return std::abs(_); } |
|
97 |
|
|
|
98 |
|
#undef PFLOAT |
|
99 |
|
#undef PFLOAT2 |
|
100 |
|
|
|
101 |
|
|
|
102 |
|
PINT() packed() : _(0) {} |
|
103 |
|
PINT() packed(const int32_t *p) : _(*p) {} |
|
104 |
|
|
|
105 |
|
PINT(void) extract(int32_t *p) const { *p = _; } |
|
106 |
|
|
|
107 |
|
|
|
108 |
|
PINT(pint_nosimd) operator + (const pint_nosimd &v) const { return _ + v._; } |
|
109 |
|
PINT(pint_nosimd) operator - (const pint_nosimd &v) const { return _ - v._; } |
|
110 |
|
PINT(pint_nosimd) operator * (const pint_nosimd &v) const { return _ * v._; } |
|
111 |
|
|
|
112 |
|
PINT(pint_nosimd) operator ^ (const pint_nosimd &i) const { return _ ^ i._; } |
|
113 |
|
|
|
114 |
|
PINT(pint_nosimd) operator | (const pint_nosimd &i) const { return _ | i._; } |
|
115 |
|
PINT(pint_nosimd) operator & (const pint_nosimd &i) const { return _ & i._; } |
|
116 |
|
PINT(pint_nosimd) and_not (const pint_nosimd &i) const { return ~_ & i._; } |
|
117 |
|
|
|
118 |
|
static_assert (0xffffffff == uint32_t(-1)); |
|
119 |
|
PINT(pint_nosimd) operator ~ () const { return ~_; } |
|
120 |
|
|
|
121 |
|
PINT(mask_nosimd) operator > (const pint_nosimd &i) const { return _ > i._ ? -1 : 0; } |
|
122 |
|
PINT(mask_nosimd) operator < (const pint_nosimd &i) const { return _ < i._ ? -1 : 0; } |
|
123 |
|
PINT(mask_nosimd) operator == (const pint_nosimd &i) const { return _ == i._ ? -1 : 0; } |
|
124 |
|
|
|
125 |
|
PINT2(pint_nosimd) operator << (int count) const { return _ << count; } |
|
126 |
|
PINT2(pint_nosimd) operator >> (int count) const { return _ >> count; } |
|
127 |
|
|
|
128 |
|
PINT(pint_nosimd) abs() const { return std::abs(_); } |
|
129 |
|
|
|
130 |
|
#undef PINT |
|
131 |
|
#undef PINT2 |
|
132 |
|
|
|
133 |
|
#define MASK(return) template<> inline return mask_nosimd:: |
|
134 |
|
#define MASK2(return) template<> template<> inline return mask_nosimd:: |
|
135 |
|
|
|
136 |
|
MASK2(pfloat_nosimd) operator &(const pfloat_nosimd &f) const { return pfloat_nosimd::cast(pint_nosimd(_)) & f; } |
|
137 |
|
|
|
138 |
|
MASK2(pfloat_nosimd) blendv (const pfloat_nosimd &a, const pfloat_nosimd &b) { return _ ? b : a; } |
|
139 |
|
|
|
140 |
|
MASK(pint_nosimd) blendv (const pint_nosimd &a, const pint_nosimd &b) { return _ ? b : a; } |
|
141 |
|
|
|
142 |
|
#undef MASK |
|
143 |
|
#undef MASK2 |
|
144 |
|
} |
|
File libs/simd/simd_sse4.2.inl deleted (index 04f1b09..0000000) |
1 |
|
#pragma once |
|
2 |
|
|
|
3 |
|
// SSE4 backend: explicit specialisations of the generic packed<> / mask<>
// templates (presumably declared in def.h) for 128-bit registers holding
// four float lanes or four int32_t lanes.
//
// The guard requires SSE4.1: the code below uses _mm_floor_ps, _mm_blendv_ps,
// _mm_blendv_epi8 and _mm_mullo_epi32 (SSE4.1) and _mm_abs_epi32 (SSSE3), so
// a target that only offers SSE2 cannot compile it.
#if defined __SSE4_1__

#include "def.h"

#include <smmintrin.h>

#include <cstring>

namespace simd
{
    // Alignment value handed to the packed<> / mask<> templates for this backend.
    constexpr const int SSE4_ALIGNMENT = 16;

    using mask_sse4   = mask  <__m128i,          SSE4_ALIGNMENT>;
    using pfloat_sse4 = packed<__m128,  float,   SSE4_ALIGNMENT>;
    using pint_sse4   = packed<__m128i, int32_t, SSE4_ALIGNMENT>;

    // Both register types share the same integer-register mask type.
    template<> struct get_mask<__m128,  SSE4_ALIGNMENT> { using T = mask_sse4; };
    template<> struct get_mask<__m128i, SSE4_ALIGNMENT> { using T = mask_sse4; };

    // Helpers that spell out the explicit-specialisation prefix; `ret` is the
    // return type of the member being defined (empty for constructors).
    #define PFLOAT(ret)  template<> inline ret pfloat_sse4::
    #define PFLOAT2(ret) template<> template<> [[nodiscard]] inline ret pfloat_sse4::

    #define PINT(ret)    template<> inline ret pint_sse4::
    #define PINT2(ret)   template<> template<> [[nodiscard]] inline ret pint_sse4::

    // ---- pfloat_sse4: construction / stores --------------------------------

    // All lanes zero.
    PFLOAT() packed() : _(_mm_setzero_ps()) {}
    // Broadcasts f to every lane.
    PFLOAT() packed(const float &f) : _(_mm_set1_ps(f)) {}
    // Aligned load of four floats; p must be 16-byte aligned (_mm_load_ps).
    PFLOAT() packed(const float *p) : _(_mm_load_ps(p)) {}

    // Aligned store of the four lanes; p must be 16-byte aligned.
    PFLOAT(void) extract(float *p) const { _mm_store_ps(p, _); }
    // Unaligned store of the four lanes.
    PFLOAT(void) extract_u(float *p) const { _mm_storeu_ps(p, _); }

    // Broadcasts i to every lane. Defined before the operators that use it
    // (operator ~ builds pint_sse4(-1)).
    PINT() packed(const int32_t &i) : _(_mm_set1_epi32(i)) {}

    // ---- float <-> int conversions -----------------------------------------

    // convert(): numeric conversion; cast(): bit-pattern reinterpretation.
    PFLOAT2(pfloat_sse4) convert(const pint_sse4 &v) { return _mm_cvtepi32_ps (v._); }
    PFLOAT2(pfloat_sse4) cast   (const pint_sse4 &v) { return _mm_castsi128_ps(v._); }

    PINT2(pint_sse4) convert(const pfloat_sse4 &v) { return _mm_cvtps_epi32 (v._); }
    PINT2(pint_sse4) cast   (const pfloat_sse4 &v) { return _mm_castps_si128(v._); }

    // ---- pfloat_sse4: comparisons ------------------------------------------

    // Each comparison produces a per-lane all-ones / all-zeros pattern in a
    // float register, which is reinterpreted as integers to form the mask.
    PFLOAT(mask_sse4) operator < (const pfloat_sse4 &v) const
    {
        return pint_sse4::cast(pfloat_sse4(_mm_cmplt_ps(_, v._)));
    }
    PFLOAT(mask_sse4) operator > (const pfloat_sse4 &v) const
    {
        return pint_sse4::cast(pfloat_sse4(_mm_cmpgt_ps(_, v._)));
    }
    PFLOAT(mask_sse4) operator <= (const pfloat_sse4 &v) const
    {
        return pint_sse4::cast(pfloat_sse4(_mm_cmple_ps(_, v._)));
    }
    PFLOAT(mask_sse4) operator >= (const pfloat_sse4 &v) const
    {
        return pint_sse4::cast(pfloat_sse4(_mm_cmpge_ps(_, v._)));
    }
    PFLOAT(mask_sse4) operator == (const pfloat_sse4 &v) const
    {
        return pint_sse4::cast(pfloat_sse4(_mm_cmpeq_ps(_, v._)));
    }

    // ---- pfloat_sse4: arithmetic / bitwise ---------------------------------

    PFLOAT(pfloat_sse4) operator + (const pfloat_sse4 &f) const { return _mm_add_ps(_, f._); }
    PFLOAT(pfloat_sse4) operator - (const pfloat_sse4 &f) const { return _mm_sub_ps(_, f._); }
    PFLOAT(pfloat_sse4) operator * (const pfloat_sse4 &f) const { return _mm_mul_ps(_, f._); }
    PFLOAT(pfloat_sse4) operator / (const pfloat_sse4 &f) const { return _mm_div_ps(_, f._); }

    PFLOAT(pfloat_sse4) operator & (const pfloat_sse4 &f) const { return _mm_and_ps(_, f._); }
    PFLOAT(pfloat_sse4) operator ^ (const pfloat_sse4 &f) const { return _mm_xor_ps(_, f._); }

    // Computes (~this) & f, i.e. the first operand is the one complemented.
    PFLOAT(pfloat_sse4) and_not (const pfloat_sse4 &f) const { return _mm_andnot_ps(_, f._); }

    // No fused multiply-add is used here; these are plain multiply followed
    // by add/subtract.
    // a * b + c
    PFLOAT(pfloat_sse4) mul_add (const pfloat_sse4 &a, const pfloat_sse4 &b, const pfloat_sse4 &c)
    {
        return c + a * b;
    }
    // c - a * b
    PFLOAT(pfloat_sse4) nmul_add (const pfloat_sse4 &a, const pfloat_sse4 &b, const pfloat_sse4 &c)
    {
        return c - a * b;
    }
    // a * b - c
    PFLOAT(pfloat_sse4) mul_sub (const pfloat_sse4 &a, const pfloat_sse4 &b, const pfloat_sse4 &c)
    {
        return a * b - c;
    }

    // Per-lane round toward negative infinity (SSE4.1).
    PFLOAT(pfloat_sse4) floor() const { return _mm_floor_ps(_); }

    // Per-lane approximate reciprocal square root.
    PFLOAT(pfloat_sse4) rsqrt() const { return _mm_rsqrt_ps(_); }

    // Clears the sign bit of every lane: -0.f has only the sign bit set, so
    // (~-0.f) & x keeps everything except the sign.
    PFLOAT(pfloat_sse4) abs() const { return pfloat_sse4(-0.f).and_not(*this); }

    #undef PFLOAT
    #undef PFLOAT2

    // ---- pint_sse4: construction / stores ----------------------------------

    // All lanes zero.
    PINT() packed() : _(_mm_setzero_si128()) {}
    // Aligned load of four int32_t; p must be 16-byte aligned. Uses the
    // integer load directly instead of a float load plus a vector cast.
    PINT() packed(const int32_t *p) : _(_mm_load_si128(reinterpret_cast<const __m128i*>(p))) {}

    // Both stores tolerate unaligned destinations, matching the byte-copy
    // behaviour callers may rely on.
    PINT(void) extract(int32_t *p) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), _); }
    PINT(void) extract_u(int32_t *p) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), _); }

    // ---- pint_sse4: arithmetic / bitwise -----------------------------------

    PINT(pint_sse4) operator + (const pint_sse4 &v) const { return _mm_add_epi32(_, v._); }
    PINT(pint_sse4) operator - (const pint_sse4 &v) const { return _mm_sub_epi32(_, v._); }
    // Low 32 bits of each lane product (SSE4.1).
    PINT(pint_sse4) operator * (const pint_sse4 &v) const { return _mm_mullo_epi32(_, v._); }

    PINT(pint_sse4) operator ^ (const pint_sse4 &i) const { return _mm_xor_si128(_, i._); }

    PINT(pint_sse4) operator | (const pint_sse4 &i) const { return _mm_or_si128 (_, i._); }
    PINT(pint_sse4) operator & (const pint_sse4 &i) const { return _mm_and_si128(_, i._); }
    // Computes (~this) & i.
    PINT(pint_sse4) and_not (const pint_sse4 &i) const { return _mm_andnot_si128(_, i._); }

    // operator ~ relies on -1 being the all-ones 32-bit pattern.
    static_assert (0xffffffff == uint32_t(-1));
    // Complement implemented as XOR with all-ones.
    PINT(pint_sse4) operator ~ () const { return pint_sse4(_) ^ pint_sse4(-1); }

    // ---- pint_sse4: comparisons / shifts -----------------------------------

    PINT(mask_sse4) operator <  (const pint_sse4 &i) const { return _mm_cmplt_epi32(_, i._); }
    PINT(mask_sse4) operator >  (const pint_sse4 &i) const { return _mm_cmpgt_epi32(_, i._); }
    PINT(mask_sse4) operator == (const pint_sse4 &i) const { return _mm_cmpeq_epi32(_, i._); }

    // Left shift is logical; right shift is arithmetic (sign-extending).
    PINT2(pint_sse4) operator << (int count) const { return _mm_slli_epi32(_, count); }
    PINT2(pint_sse4) operator >> (int count) const { return _mm_srai_epi32(_, count); }

    // Per-lane absolute value (SSSE3).
    PINT(pint_sse4) abs() const { return _mm_abs_epi32(_); }

    #undef PINT
    #undef PINT2

    // ---- mask_sse4 ----------------------------------------------------------

    #define MASK(ret)  template<> inline ret mask_sse4::
    #define MASK2(ret) template<> template<> [[nodiscard]] inline ret mask_sse4::

    // Applies the mask to a float vector: the mask bits are reinterpreted as
    // floats and ANDed with f.
    MASK2(pfloat_sse4) operator &(const pfloat_sse4 &f) const
    {
        return pfloat_sse4::cast(pint_sse4(_)) & f;
    }

    // Per-lane select: lanes whose mask sign bit is set take b, the rest a.
    MASK2(pfloat_sse4) blendv (const pfloat_sse4 &a, const pfloat_sse4 &b)
    {
        return _mm_blendv_ps(a._, b._, pfloat_sse4::cast(pint_sse4(_))._);
    }

    // Integer select; _mm_blendv_epi8 chooses per byte from the mask's byte
    // sign bits.
    MASK(pint_sse4) blendv (const pint_sse4 &a, const pint_sse4 &b)
    {
        return _mm_blendv_epi8(a._, b._, _);
    }

    #undef MASK
    #undef MASK2
}

#endif
|