1 #ifndef OPENMM_VECTORIZE_NEON_H_
2 #define OPENMM_VECTORIZE_NEON_H_
35 #include <cpu-features.h>
/**
 * Report whether the CPU advertises NEON support.
 *
 * @return true when the bitmask returned by android_getCpuFeatures() has the
 *         ANDROID_CPU_ARM_FEATURE_NEON bit set, false otherwise.
 */
46 static bool isVec4Supported() {
47 uint64_t features = android_getCpuFeatures();
48 return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
62 fvec4(
float v1,
float v2,
float v3,
float v4) {
63 float v[] = {v1, v2, v3, v4};
68 operator float32x4_t()
const {
74 return vgetq_lane_f32(
val, 0);
76 return vgetq_lane_f32(
val, 1);
78 return vgetq_lane_f32(
val, 2);
80 return vgetq_lane_f32(
val, 3);
88 return vaddq_f32(
val, other);
91 return vsubq_f32(
val, other);
94 return vmulq_f32(
val, other);
99 float32x4_t reciprocal = vrecpeq_f32(other);
100 reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
101 reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
102 fvec4 result = vmulq_f32(
val,reciprocal);
106 val = vaddq_f32(
val, other);
109 val = vsubq_f32(
val, other);
112 val = vmulq_f32(
val, other);
118 return vnegq_f32(
val);
121 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(
val), vreinterpretq_u32_f32(other)));
124 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(
val), vreinterpretq_u32_f32(other)));
127 return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(
val, other)));
130 return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(
val, other))));
133 return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(
val, other)));
136 return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(
val, other)));
139 return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(
val, other)));
142 return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(
val, other)));
144 operator ivec4()
const;
157 ivec4(
int v1,
int v2,
int v3,
int v4) {
158 int v[] = {v1, v2, v3, v4};
163 operator int32x4_t()
const {
169 return vgetq_lane_s32(
val, 0);
171 return vgetq_lane_s32(
val, 1);
173 return vgetq_lane_s32(
val, 2);
175 return vgetq_lane_s32(
val, 3);
183 return vaddq_s32(
val, other);
186 return vsubq_s32(
val, other);
189 return vmulq_s32(
val, other);
192 val = vaddq_s32(
val, other);
195 val = vsubq_s32(
val, other);
198 val = vmulq_s32(
val, other);
201 return vnegq_s32(
val);
204 return vandq_s32(
val, other);
207 return vorrq_s32(
val, other);
210 return vreinterpretq_s32_u32(vceqq_s32(
val, other));
213 return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(
val, other)));
216 return vreinterpretq_s32_u32(vcgtq_s32(
val, other));
219 return vreinterpretq_s32_u32(vcltq_s32(
val, other));
222 return vreinterpretq_s32_u32(vcgeq_s32(
val, other));
225 return vreinterpretq_s32_u32(vcleq_s32(
val, other));
227 operator fvec4()
const;
/**
 * Out-of-class definition of fvec4's conversion to ivec4.
 *
 * Passes the stored float32x4_t through the vcvtq_s32_f32 intrinsic and wraps
 * the resulting integer vector in an ivec4.
 */
232 inline fvec4::operator
ivec4()
const {
233 return ivec4(vcvtq_s32_f32(val));
/**
 * Out-of-class definition of ivec4's conversion to fvec4.
 *
 * Passes the stored int32x4_t through the vcvtq_f32_s32 intrinsic and wraps
 * the resulting float vector in an fvec4.
 */
236 inline ivec4::operator
fvec4()
const {
237 return fvec4(vcvtq_f32_s32(val));
243 return vminq_f32(v1, v2);
247 return vmaxq_f32(v1, v2);
255 float32x4_t recipSqrt = vrsqrteq_f32(v);
256 recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
257 recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
258 return vmulq_f32(v, recipSqrt);
/**
 * Three-component dot product.
 *
 * @param v1 first operand
 * @param v2 second operand
 * @return the sum of lanes 0, 1 and 2 of the element-wise product v1*v2;
 *         lane 3 of the product is not included.
 */
261 static inline float dot3(
const fvec4& v1,
const fvec4& v2) {
262 fvec4 result = v1*v2;
// Each lane is extracted to a scalar and the three scalars are added.
263 return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
/**
 * Four-component dot product.
 *
 * @param v1 first operand
 * @param v2 second operand
 * @return the sum of all four lanes of the element-wise product v1*v2.
 */
266 static inline float dot4(
const fvec4& v1,
const fvec4& v2) {
267 fvec4 result = v1*v2;
// Each lane is extracted to a scalar and the four scalars are added.
268 return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
272 float32x4x2_t t1 = vuzpq_f32(v1, v3);
273 float32x4x2_t t2 = vuzpq_f32(v2, v4);
274 float32x4x2_t t3 = vtrnq_f32(t1.val[0], t2.val[0]);
275 float32x4x2_t t4 = vtrnq_f32(t1.val[1], t2.val[1]);
285 return vminq_s32(v1, v2);
289 return vmaxq_s32(v1, v2);
293 return vabdq_s32(v,
ivec4(0));
/**
 * Test whether any lane of an integer vector is non-zero.
 *
 * @param v the vector to inspect
 * @return true if at least one of the four lanes of v differs from 0.
 */
296 static inline bool any(
const ivec4& v) {
// Lanes are read one at a time; || stops at the first non-zero lane.
297 return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
302 static inline fvec4 operator+(
float v1,
const fvec4& v2) {
306 static inline fvec4 operator-(
float v1,
const fvec4& v2) {
310 static inline fvec4 operator*(
float v1,
const fvec4& v2) {
314 static inline fvec4 operator/(
float v1,
const fvec4& v2) {
321 return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
327 fvec4 shift(0x1.0p23f);
328 fvec4 absResult = (abs(v)+shift)-shift;
329 return blend(v, absResult,
ivec4(0x7FFFFFFF));
333 fvec4 rounded = round(v);
334 return rounded + blend(0.0f, -1.0f, rounded>v);
338 fvec4 rounded = round(v);
339 return rounded + blend(0.0f, 1.0f, rounded<v);
fvec4 operator>=(const fvec4 &other) const
Definition: vectorize_neon.h:138
fvec4 operator+(const fvec4 &other) const
Definition: vectorize_neon.h:87
A four-element vector of ints.
Definition: vectorize_neon.h:150
ivec4 operator==(const ivec4 &other) const
Definition: vectorize_neon.h:209
int operator[](int i) const
Definition: vectorize_neon.h:166
fvec4 operator-(const fvec4 &other) const
Definition: vectorize_neon.h:90
float32x4_t val
Definition: vectorize_neon.h:58
float operator[](int i) const
Definition: vectorize_neon.h:71
ivec4 operator<(const ivec4 &other) const
Definition: vectorize_neon.h:218
fvec4 operator<(const fvec4 &other) const
Definition: vectorize_neon.h:135
void operator/=(const fvec4 &other)
Definition: vectorize_neon.h:114
void operator*=(const fvec4 &other)
Definition: vectorize_neon.h:111
fvec4(const float *v)
Definition: vectorize_neon.h:67
ivec4 operator-(const ivec4 &other) const
Definition: vectorize_neon.h:185
void store(int *v) const
Definition: vectorize_neon.h:179
ivec4 operator>=(const ivec4 &other) const
Definition: vectorize_neon.h:221
fvec4(float32x4_t v)
Definition: vectorize_neon.h:66
A four-element vector of floats.
Definition: vectorize_neon.h:56
ivec4 operator!=(const ivec4 &other) const
Definition: vectorize_neon.h:212
ivec4(int v1, int v2, int v3, int v4)
Definition: vectorize_neon.h:157
ivec4(const int *v)
Definition: vectorize_neon.h:162
fvec4 operator==(const fvec4 &other) const
Definition: vectorize_neon.h:126
fvec4 operator<=(const fvec4 &other) const
Definition: vectorize_neon.h:141
void store(float *v) const
Definition: vectorize_neon.h:84
void operator-=(const ivec4 &other)
Definition: vectorize_neon.h:194
void operator+=(const ivec4 &other)
Definition: vectorize_neon.h:191
ivec4 operator+(const ivec4 &other) const
Definition: vectorize_neon.h:182
fvec4()
Definition: vectorize_neon.h:60
ivec4 operator-() const
Definition: vectorize_neon.h:200
fvec4 operator/(const fvec4 &other) const
Definition: vectorize_neon.h:96
ivec4 operator|(const ivec4 &other) const
Definition: vectorize_neon.h:206
fvec4 operator!=(const fvec4 &other) const
Definition: vectorize_neon.h:129
ivec4 operator>(const ivec4 &other) const
Definition: vectorize_neon.h:215
ivec4()
Definition: vectorize_neon.h:155
void operator-=(const fvec4 &other)
Definition: vectorize_neon.h:108
fvec4 operator&(const fvec4 &other) const
Definition: vectorize_neon.h:120
fvec4 operator*(const fvec4 &other) const
Definition: vectorize_neon.h:93
fvec4(float v1, float v2, float v3, float v4)
Definition: vectorize_neon.h:62
fvec4 operator|(const fvec4 &other) const
Definition: vectorize_neon.h:123
fvec4(float v)
Definition: vectorize_neon.h:61
fvec4 operator>(const fvec4 &other) const
Definition: vectorize_neon.h:132
fvec4 operator-() const
Definition: vectorize_neon.h:117
void operator+=(const fvec4 &other)
Definition: vectorize_neon.h:105
ivec4(int v)
Definition: vectorize_neon.h:156
ivec4 operator*(const ivec4 &other) const
Definition: vectorize_neon.h:188
ivec4 operator<=(const ivec4 &other) const
Definition: vectorize_neon.h:224
ivec4(int32x4_t v)
Definition: vectorize_neon.h:161
ivec4 operator&(const ivec4 &other) const
Definition: vectorize_neon.h:203
void operator*=(const ivec4 &other)
Definition: vectorize_neon.h:197
int32x4_t val
Definition: vectorize_neon.h:153