OpenMM
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
vectorize8.h
1 #ifndef OPENMM_VECTORIZE8_H_
2 #define OPENMM_VECTORIZE8_H_
3 
4 /* -------------------------------------------------------------------------- *
5  * OpenMM *
6  * -------------------------------------------------------------------------- *
7  * This is part of the OpenMM molecular simulation toolkit originating from *
8  * Simbios, the NIH National Center for Physics-Based Simulation of *
9  * Biological Structures at Stanford, funded under the NIH Roadmap for *
10  * Medical Research, grant U54 GM072970. See https://simtk.org. *
11  * *
12  * Portions copyright (c) 2013-2014 Stanford University and the Authors. *
13  * Authors: Peter Eastman *
14  * Contributors: *
15  * *
16  * Permission is hereby granted, free of charge, to any person obtaining a *
17  * copy of this software and associated documentation files (the "Software"), *
18  * to deal in the Software without restriction, including without limitation *
19  * the rights to use, copy, modify, merge, publish, distribute, sublicense, *
20  * and/or sell copies of the Software, and to permit persons to whom the *
21  * Software is furnished to do so, subject to the following conditions: *
22  * *
23  * The above copyright notice and this permission notice shall be included in *
24  * all copies or substantial portions of the Software. *
25  * *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
29  * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
30  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
31  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
32  * USE OR OTHER DEALINGS IN THE SOFTWARE. *
33  * -------------------------------------------------------------------------- */
34 
35 #include "vectorize.h"
36 #include <immintrin.h>
37 
38 // This file defines classes and functions to simplify vectorizing code with AVX.
39 
40 class ivec8;
41 
45 class fvec8 {
46 public:
47  __m256 val;
48 
49  fvec8() {}
50  fvec8(float v) : val(_mm256_set1_ps(v)) {}
51  fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
52  fvec8(__m256 v) : val(v) {}
53  fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
54  operator __m256() const {
55  return val;
56  }
57  fvec4 lowerVec() const {
58  return _mm256_castps256_ps128(val);
59  }
60  fvec4 upperVec() const {
61  return _mm256_extractf128_ps(val, 1);
62  }
63  void store(float* v) const {
64  _mm256_storeu_ps(v, val);
65  }
66  fvec8 operator+(const fvec8& other) const {
67  return _mm256_add_ps(val, other);
68  }
69  fvec8 operator-(const fvec8& other) const {
70  return _mm256_sub_ps(val, other);
71  }
72  fvec8 operator*(const fvec8& other) const {
73  return _mm256_mul_ps(val, other);
74  }
75  fvec8 operator/(const fvec8& other) const {
76  return _mm256_div_ps(val, other);
77  }
78  void operator+=(const fvec8& other) {
79  val = _mm256_add_ps(val, other);
80  }
81  void operator-=(const fvec8& other) {
82  val = _mm256_sub_ps(val, other);
83  }
84  void operator*=(const fvec8& other) {
85  val = _mm256_mul_ps(val, other);
86  }
87  void operator/=(const fvec8& other) {
88  val = _mm256_div_ps(val, other);
89  }
90  fvec8 operator-() const {
91  return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
92  }
93  fvec8 operator&(const fvec8& other) const {
94  return _mm256_and_ps(val, other);
95  }
96  fvec8 operator|(const fvec8& other) const {
97  return _mm256_or_ps(val, other);
98  }
99  fvec8 operator==(const fvec8& other) const {
100  return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
101  }
102  fvec8 operator!=(const fvec8& other) const {
103  return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
104  }
105  fvec8 operator>(const fvec8& other) const {
106  return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
107  }
108  fvec8 operator<(const fvec8& other) const {
109  return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
110  }
111  fvec8 operator>=(const fvec8& other) const {
112  return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
113  }
114  fvec8 operator<=(const fvec8& other) const {
115  return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
116  }
117  operator ivec8() const;
118 };
119 
123 class ivec8 {
124 public:
125  __m256i val;
126 
127  ivec8() {}
128  ivec8(int v) : val(_mm256_set1_epi32(v)) {}
129  ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) : val(_mm256_set_epi32(v8, v7, v6, v5, v4, v3, v2, v1)) {}
130  ivec8(__m256i v) : val(v) {}
131  ivec8(const int* v) : val(_mm256_loadu_si256((const __m256i*) v)) {}
132  operator __m256i() const {
133  return val;
134  }
135  ivec4 lowerVec() const {
136  return _mm256_castsi256_si128(val);
137  }
138  ivec4 upperVec() const {
139  return _mm256_extractf128_si256(val, 1);
140  }
141  void store(int* v) const {
142  _mm256_storeu_si256((__m256i*) v, val);
143  }
144  ivec8 operator&(const ivec8& other) const {
145  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
146  }
147  ivec8 operator|(const ivec8& other) const {
148  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
149  }
150  operator fvec8() const;
151 };
152 
153 // Conversion operators.
154 
155 inline fvec8::operator ivec8() const {
156  return _mm256_cvttps_epi32(val);
157 }
158 
159 inline ivec8::operator fvec8() const {
160  return _mm256_cvtepi32_ps(val);
161 }
162 
163 // Functions that operate on fvec8s.
164 
165 static inline fvec8 floor(const fvec8& v) {
166  return fvec8(_mm256_round_ps(v.val, 0x09));
167 }
168 
169 static inline fvec8 ceil(const fvec8& v) {
170  return fvec8(_mm256_round_ps(v.val, 0x0A));
171 }
172 
173 static inline fvec8 round(const fvec8& v) {
174  return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
175 }
176 
177 static inline fvec8 min(const fvec8& v1, const fvec8& v2) {
178  return fvec8(_mm256_min_ps(v1.val, v2.val));
179 }
180 
181 static inline fvec8 max(const fvec8& v1, const fvec8& v2) {
182  return fvec8(_mm256_max_ps(v1.val, v2.val));
183 }
184 
185 static inline fvec8 abs(const fvec8& v) {
186  static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
187  return fvec8(_mm256_and_ps(v.val, mask));
188 }
189 
190 static inline fvec8 sqrt(const fvec8& v) {
191  return fvec8(_mm256_sqrt_ps(v.val));
192 }
193 
194 static inline float dot8(const fvec8& v1, const fvec8& v2) {
195  fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
196  return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
197 }
198 
199 static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
200  fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
201  fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
202  _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
203  _MM_TRANSPOSE4_PS(i5, i6, i7, i8);
204 #ifdef _MSC_VER
205  // Visual Studio has a bug in _mm256_castps128_ps256, so we have to use the more expensive _mm256_insertf128_ps.
206  out1 = _mm256_insertf128_ps(out1, i1, 0);
207  out2 = _mm256_insertf128_ps(out2, i2, 0);
208  out3 = _mm256_insertf128_ps(out3, i3, 0);
209  out4 = _mm256_insertf128_ps(out4, i4, 0);
210 #else
211  out1 = _mm256_castps128_ps256(i1);
212  out2 = _mm256_castps128_ps256(i2);
213  out3 = _mm256_castps128_ps256(i3);
214  out4 = _mm256_castps128_ps256(i4);
215 #endif
216  out1 = _mm256_insertf128_ps(out1, i5, 1);
217  out2 = _mm256_insertf128_ps(out2, i6, 1);
218  out3 = _mm256_insertf128_ps(out3, i7, 1);
219  out4 = _mm256_insertf128_ps(out4, i8, 1);
220 }
221 
222 static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
223  out1 = in1.lowerVec();
224  out2 = in2.lowerVec();
225  out3 = in3.lowerVec();
226  out4 = in4.lowerVec();
227  _MM_TRANSPOSE4_PS(out1, out2, out3, out4);
228  out5 = in1.upperVec();
229  out6 = in2.upperVec();
230  out7 = in3.upperVec();
231  out8 = in4.upperVec();
232  _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
233 }
234 
235 // Functions that operate on ivec8s.
236 
237 static inline bool any(const ivec8& v) {
238  return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
239 }
240 
241 // Mathematical operators involving a scalar and a vector.
242 
243 static inline fvec8 operator+(float v1, const fvec8& v2) {
244  return fvec8(v1)+v2;
245 }
246 
247 static inline fvec8 operator-(float v1, const fvec8& v2) {
248  return fvec8(v1)-v2;
249 }
250 
251 static inline fvec8 operator*(float v1, const fvec8& v2) {
252  return fvec8(v1)*v2;
253 }
254 
255 static inline fvec8 operator/(float v1, const fvec8& v2) {
256  return fvec8(v1)/v2;
257 }
258 
259 // Operations for blending fvec8s based on an ivec8.
260 
261 static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) {
262  return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
263 }
264 
265 #endif /*OPENMM_VECTORIZE8_H_*/
fvec8 operator|(const fvec8 &other) const
Definition: vectorize8.h:96
fvec8 operator!=(const fvec8 &other) const
Definition: vectorize8.h:102
A four element vector of ints.
Definition: vectorize.h:122
fvec8 operator*(const fvec8 &other) const
Definition: vectorize8.h:72
fvec8 operator/(const fvec8 &other) const
Definition: vectorize8.h:75
ivec4 lowerVec() const
Definition: vectorize8.h:135
void operator/=(const fvec8 &other)
Definition: vectorize8.h:87
fvec4 lowerVec() const
Definition: vectorize8.h:57
An eight element vector of ints.
Definition: vectorize8.h:123
fvec8()
Definition: vectorize8.h:49
fvec4 upperVec() const
Definition: vectorize8.h:60
void operator*=(const fvec8 &other)
Definition: vectorize8.h:84
fvec8 operator+(const fvec8 &other) const
Definition: vectorize8.h:66
fvec8 operator>(const fvec8 &other) const
Definition: vectorize8.h:105
fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8)
Definition: vectorize8.h:51
A four element vector of floats.
Definition: vectorize.h:45
void store(int *v) const
Definition: vectorize8.h:141
ivec8(int v)
Definition: vectorize8.h:128
ivec4 upperVec() const
Definition: vectorize8.h:138
ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
Definition: vectorize8.h:129
fvec8(float v)
Definition: vectorize8.h:50
fvec8 operator<=(const fvec8 &other) const
Definition: vectorize8.h:114
An eight element vector of floats.
Definition: vectorize8.h:45
ivec8 operator&(const ivec8 &other) const
Definition: vectorize8.h:144
fvec8 operator-(const fvec8 &other) const
Definition: vectorize8.h:69
fvec8 operator>=(const fvec8 &other) const
Definition: vectorize8.h:111
__m256i val
Definition: vectorize8.h:125
__m256 val
Definition: vectorize8.h:47
fvec8 operator==(const fvec8 &other) const
Definition: vectorize8.h:99
ivec8 operator|(const ivec8 &other) const
Definition: vectorize8.h:147
ivec8(__m256i v)
Definition: vectorize8.h:130
void operator+=(const fvec8 &other)
Definition: vectorize8.h:78
ivec8()
Definition: vectorize8.h:127
fvec8 operator&(const fvec8 &other) const
Definition: vectorize8.h:93
void operator-=(const fvec8 &other)
Definition: vectorize8.h:81
fvec8 operator-() const
Definition: vectorize8.h:90
ivec8(const int *v)
Definition: vectorize8.h:131
void store(float *v) const
Definition: vectorize8.h:63
fvec8(const float *v)
Definition: vectorize8.h:53
fvec8(__m256 v)
Definition: vectorize8.h:52
fvec8 operator<(const fvec8 &other) const
Definition: vectorize8.h:108