OpenMM
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Pages
vectorize_neon.h
1 #ifndef OPENMM_VECTORIZE_NEON_H_
2 #define OPENMM_VECTORIZE_NEON_H_
3 
4 /* -------------------------------------------------------------------------- *
5  * OpenMM *
6  * -------------------------------------------------------------------------- *
7  * This is part of the OpenMM molecular simulation toolkit originating from *
8  * Simbios, the NIH National Center for Physics-Based Simulation of *
9  * Biological Structures at Stanford, funded under the NIH Roadmap for *
10  * Medical Research, grant U54 GM072970. See https://simtk.org. *
11  * *
12  * Portions copyright (c) 2013-2014 Stanford University and the Authors. *
13  * Authors: Mateus Lima, Peter Eastman *
14  * Contributors: *
15  * *
16  * Permission is hereby granted, free of charge, to any person obtaining a *
17  * copy of this software and associated documentation files (the "Software"), *
18  * to deal in the Software without restriction, including without limitation *
19  * the rights to use, copy, modify, merge, publish, distribute, sublicense, *
20  * and/or sell copies of the Software, and to permit persons to whom the *
21  * Software is furnished to do so, subject to the following conditions: *
22  * *
23  * The above copyright notice and this permission notice shall be included in *
24  * all copies or substantial portions of the Software. *
25  * *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
29  * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
30  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
31  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
32  * USE OR OTHER DEALINGS IN THE SOFTWARE. *
33  * -------------------------------------------------------------------------- */
34 
35 #include <cpu-features.h>
36 #include <arm_neon.h>
37 #include <cmath>
38 
39 typedef int int32_t;
40 
41 // This file defines classes and functions to simplify vectorizing code with NEON.
42 
46 static bool isVec4Supported() {
47  uint64_t features = android_getCpuFeatures();
48  return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
49 }
50 
// Forward declaration so that fvec4 can declare its conversion operator to ivec4.
51 class ivec4;
52 
56 class fvec4 {
57 public:
58  float32x4_t val;
59 
60  fvec4() {}
61  fvec4(float v) : val(vdupq_n_f32(v)) {}
62  fvec4(float v1, float v2, float v3, float v4) {
63  float v[] = {v1, v2, v3, v4};
64  val = vld1q_f32(v);
65  }
66  fvec4(float32x4_t v) : val(v) {}
67  fvec4(const float* v) : val(vld1q_f32(v)) {}
68  operator float32x4_t() const {
69  return val;
70  }
71  float operator[](int i) const {
72  switch (i) {
73  case 0:
74  return vgetq_lane_f32(val, 0);
75  case 1:
76  return vgetq_lane_f32(val, 1);
77  case 2:
78  return vgetq_lane_f32(val, 2);
79  case 3:
80  return vgetq_lane_f32(val, 3);
81  }
82  return 0.0f;
83  }
84  void store(float* v) const {
85  vst1q_f32(v, val);
86  }
87  fvec4 operator+(const fvec4& other) const {
88  return vaddq_f32(val, other);
89  }
90  fvec4 operator-(const fvec4& other) const {
91  return vsubq_f32(val, other);
92  }
93  fvec4 operator*(const fvec4& other) const {
94  return vmulq_f32(val, other);
95  }
96  fvec4 operator/(const fvec4& other) const {
97  // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
98 
99  float32x4_t reciprocal = vrecpeq_f32(other);
100  reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
101  reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
102  fvec4 result = vmulq_f32(val,reciprocal);
103  return result;
104  }
105  void operator+=(const fvec4& other) {
106  val = vaddq_f32(val, other);
107  }
108  void operator-=(const fvec4& other) {
109  val = vsubq_f32(val, other);
110  }
111  void operator*=(const fvec4& other) {
112  val = vmulq_f32(val, other);
113  }
114  void operator/=(const fvec4& other) {
115  val = *this/other;
116  }
117  fvec4 operator-() const {
118  return vnegq_f32(val);
119  }
120  fvec4 operator&(const fvec4& other) const {
121  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
122  }
123  fvec4 operator|(const fvec4& other) const {
124  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
125  }
126  fvec4 operator==(const fvec4& other) const {
127  return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
128  }
129  fvec4 operator!=(const fvec4& other) const {
130  return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
131  }
132  fvec4 operator>(const fvec4& other) const {
133  return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
134  }
135  fvec4 operator<(const fvec4& other) const {
136  return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
137  }
138  fvec4 operator>=(const fvec4& other) const {
139  return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
140  }
141  fvec4 operator<=(const fvec4& other) const {
142  return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
143  }
144  operator ivec4() const;
145 };
146 
150 class ivec4 {
151 public:
152 
153  int32x4_t val;
154 
155  ivec4() {}
156  ivec4(int v) : val(vdupq_n_s32(v)) {}
157  ivec4(int v1, int v2, int v3, int v4) {
158  int v[] = {v1, v2, v3, v4};
159  val = vld1q_s32(v);
160  }
161  ivec4(int32x4_t v) : val(v) {}
162  ivec4(const int* v) : val(vld1q_s32(v)) {}
163  operator int32x4_t() const {
164  return val;
165  }
166  int operator[](int i) const {
167  switch (i) {
168  case 0:
169  return vgetq_lane_s32(val, 0);
170  case 1:
171  return vgetq_lane_s32(val, 1);
172  case 2:
173  return vgetq_lane_s32(val, 2);
174  case 3:
175  return vgetq_lane_s32(val, 3);
176  }
177  return 0;
178  }
179  void store(int* v) const {
180  vst1q_s32(v, val);
181  }
182  ivec4 operator+(const ivec4& other) const {
183  return vaddq_s32(val, other);
184  }
185  ivec4 operator-(const ivec4& other) const {
186  return vsubq_s32(val, other);
187  }
188  ivec4 operator*(const ivec4& other) const {
189  return vmulq_s32(val, other);
190  }
191  void operator+=(const ivec4& other) {
192  val = vaddq_s32(val, other);
193  }
194  void operator-=(const ivec4& other) {
195  val = vsubq_s32(val, other);
196  }
197  void operator*=(const ivec4& other) {
198  val = vmulq_s32(val, other);
199  }
200  ivec4 operator-() const {
201  return vnegq_s32(val);
202  }
203  ivec4 operator&(const ivec4& other) const {
204  return vandq_s32(val, other);
205  }
206  ivec4 operator|(const ivec4& other) const {
207  return vorrq_s32(val, other);
208  }
209  ivec4 operator==(const ivec4& other) const {
210  return vreinterpretq_s32_u32(vceqq_s32(val, other));
211  }
212  ivec4 operator!=(const ivec4& other) const {
213  return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
214  }
215  ivec4 operator>(const ivec4& other) const {
216  return vreinterpretq_s32_u32(vcgtq_s32(val, other));
217  }
218  ivec4 operator<(const ivec4& other) const {
219  return vreinterpretq_s32_u32(vcltq_s32(val, other));
220  }
221  ivec4 operator>=(const ivec4& other) const {
222  return vreinterpretq_s32_u32(vcgeq_s32(val, other));
223  }
224  ivec4 operator<=(const ivec4& other) const {
225  return vreinterpretq_s32_u32(vcleq_s32(val, other));
226  }
227  operator fvec4() const;
228 };
229 
230 // Conversion operators.
231 
232 inline fvec4::operator ivec4() const {
233  return ivec4(vcvtq_s32_f32(val));
234 }
235 
236 inline ivec4::operator fvec4() const {
237  return fvec4(vcvtq_f32_s32(val));
238 }
239 
240 // Functions that operate on fvec4s.
241 
242 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
243  return vminq_f32(v1, v2);
244 }
245 
246 static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
247  return vmaxq_f32(v1, v2);
248 }
249 
250 static inline fvec4 abs(const fvec4& v) {
251  return vabsq_f32(v);
252 }
253 
254 static inline fvec4 sqrt(const fvec4& v) {
255  float32x4_t recipSqrt = vrsqrteq_f32(v);
256  recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
257  recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
258  return vmulq_f32(v, recipSqrt);
259 }
260 
261 static inline float dot3(const fvec4& v1, const fvec4& v2) {
262  fvec4 result = v1*v2;
263  return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
264 }
265 
266 static inline float dot4(const fvec4& v1, const fvec4& v2) {
267  fvec4 result = v1*v2;
268  return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
269 }
270 
271 static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
272  float32x4x2_t t1 = vuzpq_f32(v1, v3);
273  float32x4x2_t t2 = vuzpq_f32(v2, v4);
274  float32x4x2_t t3 = vtrnq_f32(t1.val[0], t2.val[0]);
275  float32x4x2_t t4 = vtrnq_f32(t1.val[1], t2.val[1]);
276  v1 = t3.val[0];
277  v2 = t4.val[0];
278  v3 = t3.val[1];
279  v4 = t4.val[1];
280 }
281 
282 // Functions that operate on ivec4s.
283 
284 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
285  return vminq_s32(v1, v2);
286 }
287 
288 static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
289  return vmaxq_s32(v1, v2);
290 }
291 
292 static inline ivec4 abs(const ivec4& v) {
293  return vabdq_s32(v, ivec4(0));
294 }
295 
296 static inline bool any(const ivec4& v) {
297  return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
298 }
299 
300 // Mathematical operators involving a scalar and a vector.
301 
302 static inline fvec4 operator+(float v1, const fvec4& v2) {
303  return fvec4(v1)+v2;
304 }
305 
306 static inline fvec4 operator-(float v1, const fvec4& v2) {
307  return fvec4(v1)-v2;
308 }
309 
310 static inline fvec4 operator*(float v1, const fvec4& v2) {
311  return fvec4(v1)*v2;
312 }
313 
314 static inline fvec4 operator/(float v1, const fvec4& v2) {
315  return fvec4(v1)/v2;
316 }
317 
318 // Operations for blending fvec4s based on an ivec4.
319 
320 static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
321  return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
322 }
323 
324 // These are at the end since they involve other functions defined above.
325 
326 static inline fvec4 round(const fvec4& v) {
327  fvec4 shift(0x1.0p23f);
328  fvec4 absResult = (abs(v)+shift)-shift;
329  return blend(v, absResult, ivec4(0x7FFFFFFF));
330 }
331 
332 static inline fvec4 floor(const fvec4& v) {
333  fvec4 rounded = round(v);
334  return rounded + blend(0.0f, -1.0f, rounded>v);
335 }
336 
337 static inline fvec4 ceil(const fvec4& v) {
338  fvec4 rounded = round(v);
339  return rounded + blend(0.0f, 1.0f, rounded<v);
340 }
341 
342 #endif /*OPENMM_VECTORIZE_NEON_H_*/
fvec4 operator>=(const fvec4 &other) const
Definition: vectorize_neon.h:138
fvec4 operator+(const fvec4 &other) const
Definition: vectorize_neon.h:87
A four element vector of ints.
Definition: vectorize_neon.h:150
ivec4 operator==(const ivec4 &other) const
Definition: vectorize_neon.h:209
int operator[](int i) const
Definition: vectorize_neon.h:166
fvec4 operator-(const fvec4 &other) const
Definition: vectorize_neon.h:90
float32x4_t val
Definition: vectorize_neon.h:58
float operator[](int i) const
Definition: vectorize_neon.h:71
ivec4 operator<(const ivec4 &other) const
Definition: vectorize_neon.h:218
fvec4 operator<(const fvec4 &other) const
Definition: vectorize_neon.h:135
void operator/=(const fvec4 &other)
Definition: vectorize_neon.h:114
void operator*=(const fvec4 &other)
Definition: vectorize_neon.h:111
fvec4(const float *v)
Definition: vectorize_neon.h:67
ivec4 operator-(const ivec4 &other) const
Definition: vectorize_neon.h:185
void store(int *v) const
Definition: vectorize_neon.h:179
ivec4 operator>=(const ivec4 &other) const
Definition: vectorize_neon.h:221
fvec4(float32x4_t v)
Definition: vectorize_neon.h:66
A four element vector of floats.
Definition: vectorize_neon.h:56
ivec4 operator!=(const ivec4 &other) const
Definition: vectorize_neon.h:212
ivec4(int v1, int v2, int v3, int v4)
Definition: vectorize_neon.h:157
ivec4(const int *v)
Definition: vectorize_neon.h:162
fvec4 operator==(const fvec4 &other) const
Definition: vectorize_neon.h:126
fvec4 operator<=(const fvec4 &other) const
Definition: vectorize_neon.h:141
void store(float *v) const
Definition: vectorize_neon.h:84
void operator-=(const ivec4 &other)
Definition: vectorize_neon.h:194
void operator+=(const ivec4 &other)
Definition: vectorize_neon.h:191
ivec4 operator+(const ivec4 &other) const
Definition: vectorize_neon.h:182
fvec4()
Definition: vectorize_neon.h:60
ivec4 operator-() const
Definition: vectorize_neon.h:200
fvec4 operator/(const fvec4 &other) const
Definition: vectorize_neon.h:96
ivec4 operator|(const ivec4 &other) const
Definition: vectorize_neon.h:206
fvec4 operator!=(const fvec4 &other) const
Definition: vectorize_neon.h:129
ivec4 operator>(const ivec4 &other) const
Definition: vectorize_neon.h:215
ivec4()
Definition: vectorize_neon.h:155
void operator-=(const fvec4 &other)
Definition: vectorize_neon.h:108
fvec4 operator&(const fvec4 &other) const
Definition: vectorize_neon.h:120
fvec4 operator*(const fvec4 &other) const
Definition: vectorize_neon.h:93
fvec4(float v1, float v2, float v3, float v4)
Definition: vectorize_neon.h:62
fvec4 operator|(const fvec4 &other) const
Definition: vectorize_neon.h:123
fvec4(float v)
Definition: vectorize_neon.h:61
fvec4 operator>(const fvec4 &other) const
Definition: vectorize_neon.h:132
fvec4 operator-() const
Definition: vectorize_neon.h:117
void operator+=(const fvec4 &other)
Definition: vectorize_neon.h:105
ivec4(int v)
Definition: vectorize_neon.h:156
ivec4 operator*(const ivec4 &other) const
Definition: vectorize_neon.h:188
ivec4 operator<=(const ivec4 &other) const
Definition: vectorize_neon.h:224
ivec4(int32x4_t v)
Definition: vectorize_neon.h:161
ivec4 operator&(const ivec4 &other) const
Definition: vectorize_neon.h:203
void operator*=(const ivec4 &other)
Definition: vectorize_neon.h:197
int32x4_t val
Definition: vectorize_neon.h:153